R Markdown

This is an R Markdown document. Markdown is a Simple formatting syntax for authoring HTML, PDF, and MS Word documents. Html_fragmented is quicker to load than html, hence for this pipeline which genenerated a large mount of data, we have used html_fragment to report Rmarkdown.

Checklist for this pipeline

#1) The present analysis was done on MacOS, using knitToHtmlfragment.

To reuse this code for other datasets a) in this pipeline human gene symbols are used

SVA+LM or SVA+lmY or none and DEG_Limma normalization for study or experiment or batch correction and removal of gender

Step6: Load libraries, set working directory and Import data

#save working directory location
wd<-getwd()
wd
## [1] "C:/Users/SMukherjee/Desktop/VM-share/extra/DEG_NoCuff_CellTypeBrainImmune(notStemCells_6-25-19)/CellType_monocytesBYo_FC5UP"
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
## Loading required package: mgcv
## Warning: package 'mgcv' was built under R version 3.5.3
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 3.5.3
## This is mgcv 1.8-28. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
library(pamr)
## Warning: package 'pamr' was built under R version 3.5.3
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.5.3
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.5.3
library(ggplot2) # plot
## Warning: package 'ggplot2' was built under R version 3.5.3
library(ggmap)#plots
## Warning: package 'ggmap' was built under R version 3.5.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(gplots)#plots
## Warning: package 'gplots' was built under R version 3.5.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
## Warning: package 'Hmisc' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(viridis)#for corelation plot
## Warning: package 'viridis' was built under R version 3.5.3
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 3.5.3
library(corrplot)#for corelation plot
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(filesstrings)#for file organization
## Warning: package 'filesstrings' was built under R version 3.5.3
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 3.5.3
library(expss)#for handling NAs
## Warning: package 'expss' was built under R version 3.5.3
## 
## Attaching package: 'expss'
## The following objects are masked from 'package:stringr':
## 
##     fixed, regex
## The following object is masked from 'package:ggplot2':
## 
##     vars
#Metadata
metadata=read.csv("Step5D_result_merge_GSE52564_to_SRP035309_metadata_SampleID.csv", header=T, sep=',')

#Expression data
Expr_GeneMs=read.csv("Step5D_result_merge_GSE52564_to_SRP035309_Expr_Gene_SampleID.csv", header =T, sep=',')
#Rename the gene symbol column from X to GeneMs
colnames(Expr_GeneMs)[colnames(Expr_GeneMs) == 'X'] <- 'GeneMs'
dim(metadata)
## [1] 30  9
dim(Expr_GeneMs)
## [1] 22123    31
#Need to make sure that the Gene IDs column labeled as GeneMs are unique to specify rownames with Gene IDs
Expr_GeneMs = Expr_GeneMs[!duplicated(Expr_GeneMs$GeneMs),]
dim(metadata)
## [1] 30  9
dim(Expr_GeneMs)
## [1] 22123    31
#We do not lose Gene IDs in RNA-seq (has no duplicates) unline microarray (has duplicates)
Expr_GeneMs_1=Expr_GeneMs[,-1] #get rid of extra column
rownames(Expr_GeneMs_1)=Expr_GeneMs$GeneMs 
Expr_GeneMs_1=Expr_GeneMs_1[complete.cases(Expr_GeneMs_1), ]
edata=Expr_GeneMs_1
#Optional filter low counts or else svobj step compalins about missing values and all zero rows
#Expr_GeneMs_2=Expr_GeneMs_1[rowSums(Expr_GeneMs_1)>0,]
#edata=Expr_GeneMs_2
#From here identify column numbers of interest for next step
colnames(metadata)
## [1] "X"                   "Sample_Name"         "MsGroup_Name"       
## [4] "CellType"            "Age"                 "Study"              
## [7] "Species"             "Tissue"              "CellTypeDescription"
metadata_1=metadata[,c("CellType","Age","Study","Species","Tissue")]
#metadata_1=metadata[,4:8]
rownames(metadata_1)=metadata$Sample_Name
#View(metadata_1)
colnames(metadata_1)
## [1] "CellType" "Age"      "Study"    "Species"  "Tissue"
#only keep the variables that will be considered in the mod, mod null and linear regression
#metadata_2=metadata_1[,1:5]
metadata_2=metadata_1[,c("CellType","Age","Study","Species","Tissue")]
rownames(metadata_2)=rownames(metadata_1)
#Optional way to drop metadata pheno trait column. Instead of dropping metadata pheno trait column in this pipeline we only provide traits of interest in the model.matrix steps
#metadata_2=metadata_2[!names(metadata_2) %in% c("Tissue")]
pheno=metadata_2
#View(metadata_2)
#View(pheno)
colnames(pheno)
## [1] "CellType" "Age"      "Study"    "Species"  "Tissue"
DT::datatable(pheno)
DT::datatable(edata[1:7,1:7])
dim(metadata)
## [1] 30  9
dim(pheno)
## [1] 30  5
dim(Expr_GeneMs) #dropped off 1 extra column
## [1] 22123    31
dim(edata)
## [1] 22123    30
#Make sure we did not lose any samples in expression data
#Here in the output the 1st two and the last two should be same i.e. the first and last samples are same
colnames(Expr_GeneMs[2])
## [1] "astro_SRR1033783"
colnames(edata[1])
## [1] "astro_SRR1033783"
colnames(Expr_GeneMs[22])
## [1] "monocytes_SRR5483451"
colnames(edata[21])
## [1] "monocytes_SRR5483451"

Step7: Preparing aggregate data expr or edata from metadata by variables

t_edata<-t(edata)
DT::datatable(t_edata[,1:7])
DT::datatable(pheno)
dim(t_edata)
## [1]    30 22123
dim(pheno)
## [1] 30  5
#bind the pheno and edata, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata<-cbind(pheno, t_edata)
dim(pheno_t_edata)
## [1]    30 22128
DT::datatable(pheno_t_edata[,1:10])
write.csv(pheno_t_edata, "Step7_result_pheno_t_edata_All_Group_aggregate.csv")

Aggregade by CellType

#select the CellType pheno drop others
pheno_t_edata_CellType=pheno_t_edata[!names(pheno_t_edata) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_CellType)
## [1]    30 22124
DT::datatable(pheno_t_edata_CellType[,1:10])
#aggregate edata or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:expss':
## 
##     copy
setDT(pheno_t_edata_CellType)
pheno_t_edata_CellType_avg=pheno_t_edata_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_CellType_avg=t(pheno_t_edata_CellType_avg[,-1])
colnames(t_pheno_t_edata_CellType_avg)=pheno_t_edata_CellType_avg$CellType
dim(t_pheno_t_edata_CellType_avg)
## [1] 22123     2
DT::datatable(t_pheno_t_edata_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_CellType_avg, "Step7_result_edata_CellType_aggregate.csv")

Aggregade by Species

#select the Species pheno drop others
pheno_t_edata_Species=pheno_t_edata[!names(pheno_t_edata) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_Species)
## [1]    30 22124
DT::datatable(pheno_t_edata_Species[,1:10])
#aggregate edata or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Species)
pheno_t_edata_Species_avg=pheno_t_edata_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Species_avg=t(pheno_t_edata_Species_avg[,-1])
colnames(t_pheno_t_edata_Species_avg)=pheno_t_edata_Species_avg$Species
dim(t_pheno_t_edata_Species_avg)
## [1] 22123     2
DT::datatable(t_pheno_t_edata_Species_avg[1:10,])
write.csv(t_pheno_t_edata_Species_avg, "Step7_result_edata_Species_aggregate.csv")

Aggregade by Age

#select the Age pheno drop others
pheno_t_edata_Age=pheno_t_edata[!names(pheno_t_edata) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_Age)
## [1]    30 22124
DT::datatable(pheno_t_edata_Age[,1:10])
#aggregate edata or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Age)
pheno_t_edata_Age_avg=pheno_t_edata_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Age_avg=t(pheno_t_edata_Age_avg[,-1])
colnames(t_pheno_t_edata_Age_avg)=pheno_t_edata_Age_avg$Age
dim(t_pheno_t_edata_Age_avg)
## [1] 22123     2
DT::datatable(t_pheno_t_edata_Age_avg[1:10,])
write.csv(t_pheno_t_edata_Age_avg, "Step7_result_edata_Age_aggregate.csv")

Aggregade by Tissue

#select the Tissue pheno drop others
pheno_t_edata_Tissue=pheno_t_edata[!names(pheno_t_edata) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_Tissue)
## [1]    30 22124
DT::datatable(pheno_t_edata_Tissue[,1:10])
#aggregate edata or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_Tissue)
pheno_t_edata_Tissue_avg=pheno_t_edata_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_Tissue_avg=t(pheno_t_edata_Tissue_avg[,-1])
colnames(t_pheno_t_edata_Tissue_avg)=pheno_t_edata_Tissue_avg$Tissue
dim(t_pheno_t_edata_Tissue_avg)
## [1] 22123     4
DT::datatable(t_pheno_t_edata_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_Tissue_avg, "Step7_result_edata_Tissue_aggregate.csv")

Aggregade all Groups

t_pheno_t_edata_all_avg<-cbind(t_pheno_t_edata_CellType_avg, t_pheno_t_edata_Species_avg, t_pheno_t_edata_Age_avg, t_pheno_t_edata_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_all_avg))
write.csv(t_pheno_t_edata_all_avg, "Step7_result_edata_All_Group_aggregate.csv")
#save pheno_t_edata, t_pheno_t_edata_CellType_avg, t_pheno_t_edata_Species_avg, t_pheno_t_edata_Age_avg, t_pheno_t_edata_Tissue_avg, t_pheno_t_edata_all_avg
save(pheno_t_edata, t_pheno_t_edata_CellType_avg, t_pheno_t_edata_Species_avg, t_pheno_t_edata_Age_avg, t_pheno_t_edata_Tissue_avg, t_pheno_t_edata_all_avg, file="edata_aggregate_data.RData")

Step7: Visualization by heatmap and correlation of aggregate data expr or edata from metadata by variables

pdf(file="Step7_plot_boxplot_corr_plot_edata_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))

                    #########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_CellType_avg", col="yellow")

#box plot on aggregate Species
boxplot(t_pheno_t_edata_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Species_avg", col="yellow")

#box plot on aggregate Age
boxplot(t_pheno_t_edata_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Age_avg", col="yellow")

#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_Tissue_avg", col="yellow")

#box plot on aggregate all
boxplot(t_pheno_t_edata_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_all_avg", col="yellow")

                    #########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_CellType_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Species_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Age_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_Tissue_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step7_result_edata_all_Group_corr_pval_aggregate.csv")

dev.off()
## png 
##   2
pdf(file="Step7_plot_density_edata_aka_expr_aggregate.pdf",height=10,width=10)

                    #########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_CellType_avg", y = "Density")


#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Species",x="t_pheno_t_edata_Species_avg", y = "Density")
  

#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Age",x="t_pheno_t_edata_Age_avg", y = "Density")

#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_Tissue_avg", y = "Density")

#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for all",x="t_pheno_t_edata_all_avg", y = "Density") 

dev.off()
## png 
##   2

Step7: Visualization of edata pca_unscaled filter

SVA none and pca_unscaled

edata_pca_unscaled filtered to pca_unscaled genes

edata_pca_unscaled=edata
#Summary Statistics edata_pca_unscaled
DT::datatable(summary(edata_pca_unscaled[1:7,1:7]))
write.csv(summary(edata_pca_unscaled),"Step7_result_summary_stats_edata_pca_unscaled.csv")
pdf(file="Step7_plot_Visualization_boxplot_MDS_PC_pca_unscaled_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))

                            ############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]

                            ############boxplot chunk###############
#Before boxplot edata_pca_unscaled
par(mai=c(1,0.8,1,0.8))
boxplot(edata_pca_unscaled, outline=FALSE, las=2, cex=0.25, main="edata_pca_unscaled", col="yellow")

                            ############MDSplot chunk###############
#Before MDSplot edata_pca_unscaled
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_pca_unscaled, col=condition_colors, legend= "all", main="edata_pca_unscaled", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
                    #######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_pca_unscaled, colored by Study or batch, labels CellType
#pca0=prcomp(t(edata_pca_unscaled), center=T, scale =T)
pca0=prcomp(t(edata_pca_unscaled))
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2       PC3
## astro_SRR1033783 -189.88574 -31.84178 -53.95528
## astro_SRR1033784 -193.28961 -34.46141 -60.07396
## endo_SRR1033795   -93.02506  47.94031  52.68268
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_pca_unscaled",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$CellType,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                             #########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_pca_unscaled, colored by Study or batch, labels Species
#pca0=prcomp(t(edata_pca_unscaled), center=T, scale=T)
pca0=prcomp(t(edata_pca_unscaled))
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2       PC3
## astro_SRR1033783 -189.88574 -31.84178 -53.95528
## astro_SRR1033784 -193.28961 -34.46141 -60.07396
## endo_SRR1033795   -93.02506  47.94031  52.68268
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_pca_unscaled",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Species,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                                 #########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_pca_unscaled, colored by Study or batch, labels Age
#pca0=prcomp(t(edata_pca_unscaled), center=T, scale=T)
pca0=prcomp(t(edata_pca_unscaled))
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2       PC3
## astro_SRR1033783 -189.88574 -31.84178 -53.95528
## astro_SRR1033784 -193.28961 -34.46141 -60.07396
## endo_SRR1033795   -93.02506  47.94031  52.68268
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_pca_unscaled",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Age,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

dev.off()
## png 
##   2
save.image(file="done_pre_Step8.RData")

Step8: Preparing models for SVA (Surrogate variable adjustment)

See SVAlm pipelines

Step9: Doing SVA (Surrogate variable adjustment)

See SVAlm pipelines

Step10: Adjust edata based on SVA (using LM or cleanY)

See SVAlm pipelines

Step10: Visualization of before and after SVA lmY

See SVAlm pipelines

Step11: Preparing aggregate data edata_lmY from metadata by variables

See SVAlm pipelines

Step11: Visualization by heatmap and correlation of aggregate data logtpm or edata_lmY from metadata by variables

See SVAlm pipelines

See DEG pipeline, uses edata and modified metadata

Step12: Differentially expressed genes DEG_Limma limma method

#Load pheno, edata and edata_combatY required for steps below
load(file="done_pre_Step8.RData")
gc() 
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4131120 220.7    8073370 431.2  8073370 431.2
## Vcells 17020959 129.9   29333001 223.8 21625243 165.0
#load required libraries
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
library(pamr)
library(ggplot2) # plot
library(ggrepel) #Avoid overlapping labels
## Warning: package 'ggrepel' was built under R version 3.5.3
library(ggmap)#plots
library(gplots)#plots
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
library(viridis)#for corelation plot
library(corrplot)#for corelation plot
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:expss':
## 
##     between, compute, first, last, na_if, recode, vars
## The following object is masked from 'package:filesstrings':
## 
##     all_equal
## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize
## The following object is masked from 'package:nlme':
## 
##     collapse
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Input data should be log2 as per limma http://web.mit.edu/~r/current/arch/i386_linux26/lib/R/library/limma/html/lmFit.html

#From here identify column numbers of interest for next step
colnames(pheno)
## [1] "CellType" "Age"      "Study"    "Species"  "Tissue"
#Note here we use the generic DEG_Limma differential expression calculation step
#mod only includes the variables of interest and covariants, not surrogate variables
#https://www.rdocumentation.org/packages/limma/versions/3.28.14/topics/lmFit
#fit_DEG_Limma=lmFit(edata,mod)

mod_DEG_Limma = model.matrix(~0+CellType+Study, data=pheno) 

fit_DEG_Limma=lmFit(edata,mod_DEG_Limma)
summary(fit_DEG_Limma)
##                  Length Class  Mode     
## coefficients     88492  -none- numeric  
## rank                 1  -none- numeric  
## assign               4  -none- numeric  
## qr                   5  qr     list     
## df.residual      22123  -none- numeric  
## sigma            22123  -none- numeric  
## cov.coefficients    16  -none- numeric  
## stdev.unscaled   88492  -none- numeric  
## pivot                4  -none- numeric  
## Amean            22123  -none- numeric  
## method               1  -none- character
## design             120  -none- numeric
DT::datatable(fit_DEG_Limma$coefficients)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
#Coefficients if not estimable i.e. gives NAs in coefficient then remove. Try to use the max number of variables
#mod_DEG_Limma = model.matrix(~0+CellType+Study, data=pheno)
#mod_DEG_Limma = model.matrix(~0+CellType+Study+Species+Tissue+Age, data=pheno)
#mod_DEG_Limma = model.matrix(~0+CellType+Species+Age, data=pheno)
#Use the values in the creating_CellType_contrasts as the first three values of contrast matrix for CellType 
design_DEG_Limma<-model.matrix(~0+CellType, data=pheno)
colnames(design_DEG_Limma)
## [1] "CellTypemonocytes" "CellTypeother"
design_DEG_Limma[1:3,]
##                  CellTypemonocytes CellTypeother
## astro_SRR1033783                 0             1
## astro_SRR1033784                 0             1
## endo_SRR1033795                  0             1
DT::datatable(design_DEG_Limma)
#comparisons you want write here
creating_CellType_contrasts_DEG_Limma<-makeContrasts(monocytesBYo=CellTypemonocytes-CellTypeother, levels =design_DEG_Limma)
creating_CellType_contrasts_DEG_Limma
##                    Contrasts
## Levels              monocytesBYo
##   CellTypemonocytes            1
##   CellTypeother               -1
#Create contrasts to find the differentially expressed genes .
#We hold other variables and surrogate variables at 0 (NA since we are using combat)
#Here the net number of rows is same and correspond to columns in fit_DEG_Limma$coefficients
contrast.matrix_DEG_Limma <- cbind("monocytesBYo"=c(1,-1,rep(0,2)))
contrast.matrix_DEG_Limma
##      monocytesBYo
## [1,]            1
## [2,]           -1
## [3,]            0
## [4,]            0
#Identification of differentially expressed genes for the contrasts
fitContrasts_DEG_Limma=contrasts.fit(fit_DEG_Limma, contrast.matrix_DEG_Limma)

#calculate empirical Bayes variabce 
eb_DEG_Limma=eBayes(fitContrasts_DEG_Limma)
#top table and results table for differentially expressed genes for comparison of all levels. This is different from the pairwise contrast comparison
result_tT_full_DEG_Limma<- topTableF(eb_DEG_Limma, adjust="BH", n=nrow(edata))
result_tT_FC5.00_DEG_Limma <- result_tT_full_DEG_Limma[which(result_tT_full_DEG_Limma$adj.P.Val <= 0.05), ]

dim(result_tT_full_DEG_Limma)
## [1] 22123     5
dim(result_tT_FC5.00_DEG_Limma)
## [1] 5355    5
write.csv(result_tT_full_DEG_Limma, "Step12_result_tT_full_DEG_Limma.csv")
write.csv(result_tT_FC5.00_DEG_Limma, "Step12_result_tT_FC5.00_DEG_Limma.csv")
#DEG_Limma for contrast "monocytesBYo"=CellTypemonocytes-CellTypeother 
result_monocytesBYo_full_DEG_Limma<- topTable(eb_DEG_Limma, number = nrow(edata), adjust = "BH", coef="monocytesBYo")
result_monocytesBYo_full_DEG_Limma$Contrast<-rep("monocytesBYo",nrow(result_monocytesBYo_full_DEG_Limma))
result_monocytesBYo_full_DEG_Limma$FC<- 2^(result_monocytesBYo_full_DEG_Limma$logFC)
result_monocytesBYo_FC5.00_DEG_Limma<- result_monocytesBYo_full_DEG_Limma[which(result_monocytesBYo_full_DEG_Limma$FC >= 5.00), ]

dim(result_monocytesBYo_full_DEG_Limma)
## [1] 22123     8
dim(result_monocytesBYo_FC5.00_DEG_Limma)
## [1] 856   8
write.csv(result_monocytesBYo_full_DEG_Limma, "Step12_result_monocytesBYo_full_DEG_Limma.csv")
write.csv(result_monocytesBYo_FC5.00_DEG_Limma, "Step12_result_monocytesBYo_FC5.00_DEG_Limma.csv")

summary(result_monocytesBYo_full_DEG_Limma)
##      logFC             AveExpr               t           
##  Min.   :-9.57945   Min.   : 0.06945   Min.   :-36.3159  
##  1st Qu.:-0.53569   1st Qu.: 0.44096   1st Qu.: -1.0703  
##  Median : 0.08722   Median : 1.36609   Median :  0.1694  
##  Mean   : 0.05901   Mean   : 1.95727   Mean   :  0.3854  
##  3rd Qu.: 0.61965   3rd Qu.: 3.00784   3rd Qu.:  1.5605  
##  Max.   :12.79280   Max.   :13.10800   Max.   : 52.7903  
##     P.Value          adj.P.Val             B            Contrast        
##  Min.   :0.00000   Min.   :0.00000   Min.   :-6.965   Length:22123      
##  1st Qu.:0.01409   1st Qu.:0.05633   1st Qu.:-6.834   Class :character  
##  Median :0.20482   Median :0.40962   Median :-6.126   Mode  :character  
##  Mean   :0.33049   Mean   :0.44496   Mean   :-4.070                     
##  3rd Qu.:0.61639   3rd Qu.:0.82184   3rd Qu.:-3.823                     
##  Max.   :1.00000   Max.   :1.00000   Max.   :51.772                     
##        FC          
##  Min.   :   0.001  
##  1st Qu.:   0.690  
##  Median :   1.062  
##  Mean   :   2.960  
##  3rd Qu.:   1.537  
##  Max.   :7096.028
summary(result_monocytesBYo_FC5.00_DEG_Limma)
##      logFC           AveExpr              t             P.Value         
##  Min.   : 2.328   Min.   : 0.3994   Min.   : 1.666   Min.   :0.000e+00  
##  1st Qu.: 2.778   1st Qu.: 0.8731   1st Qu.: 4.269   1st Qu.:0.000e+00  
##  Median : 3.409   Median : 1.6458   Median : 6.444   Median :5.100e-07  
##  Mean   : 3.921   Mean   : 2.0812   Mean   : 8.532   Mean   :2.357e-03  
##  3rd Qu.: 4.611   3rd Qu.: 2.7058   3rd Qu.:10.488   3rd Qu.:1.967e-04  
##  Max.   :12.793   Max.   :10.2528   Max.   :52.790   Max.   :1.067e-01  
##    adj.P.Val               B             Contrast        
##  Min.   :0.000e+00   Min.   :-5.6079   Length:856        
##  1st Qu.:0.000e+00   1st Qu.: 0.2477   Class :character  
##  Median :1.069e-05   Median : 6.1149   Mode  :character  
##  Mean   :8.119e-03   Mean   : 9.3597                     
##  3rd Qu.:1.720e-03   3rd Qu.:15.8859                     
##  Max.   :2.537e-01   Max.   :51.7721                     
##        FC          
##  Min.   :   5.020  
##  1st Qu.:   6.861  
##  Median :  10.619  
##  Mean   :  47.166  
##  3rd Qu.:  24.430  
##  Max.   :7096.028
#Bind all differentally expressed full genes into a single table
result_allContrasts_full_DEG_Limma<-rbind(result_monocytesBYo_full_DEG_Limma) #here only one table so rbind not required
dim(result_allContrasts_full_DEG_Limma)
## [1] 22123     8
write.csv(result_allContrasts_full_DEG_Limma, "Step12_result_allContrasts_full_DEG_Limma.csv")

#Bind all differentally expressed pval genes into a single table
result_allContrasts_FC5.00_DEG_Limma<-rbind(result_monocytesBYo_FC5.00_DEG_Limma) #here only one table so rbind not required
dim(result_allContrasts_FC5.00_DEG_Limma)
## [1] 856   8
write.csv(result_allContrasts_FC5.00_DEG_Limma, "Step12_result_allContrasts_FC5.00_DEG_Limma.csv")

Step13: Visualization of differential gene expression results from DEG_Limma

# Q-Q plot of moderated t-statistics
#http://web.mit.edu/~r/current/arch/i386_linux26/lib/R/library/limma/html/lmFit.html
pdf(file="Step13_plot_qqplot_eb_DEG_Limma.pdf",height=10,width=10)
par(mfrow=c(2,2))
qqt(eb_DEG_Limma$t[,"monocytesBYo"],df_DEG_Limma=eb_DEG_Limma$df.residual+eb_DEG_Limma$df.prior)
## Warning in plot.window(...): "df_DEG_Limma" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "df_DEG_Limma" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "df_DEG_Limma"
## is not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "df_DEG_Limma"
## is not a graphical parameter
## Warning in box(...): "df_DEG_Limma" is not a graphical parameter
## Warning in title(...): "df_DEG_Limma" is not a graphical parameter
abline(0,1)
dev.off()
## png 
##   2
#https://www.biostars.org/p/203876/
pdf(file="Step13_plot_Volcano_result_adjFC5.00_DEG_Limma.pdf",height=10,width=10)
#par(mfrow=c(2,2))

df<-result_monocytesBYo_full_DEG_Limma
df$gene<-rownames(result_monocytesBYo_full_DEG_Limma) #create column for genes
mutateddf <- mutate(df, sig=ifelse(df$adj.P.Val<0.05, "adj.P.Val<0.05", "Not Sig")) 
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
       geom_point(aes(col=sig)) + #add points colored by significance
       scale_color_manual(values=c("red", "black")) + 
       ggtitle("Volcanoplot DEG_Limma")+
       geom_text_repel(data=head(mutateddf, 10), aes(label=gene)) 
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg") 
volc

dev.off()
## png 
##   2
#https://www.biostars.org/p/203876/
pdf(file="Step13_plot_Volcano_result_adjPValFC_DEG_Limma.pdf",height=10,width=10)
#par(mfrow=c(2,2))

df<-result_monocytesBYo_full_DEG_Limma
df$gene<-rownames(result_monocytesBYo_full_DEG_Limma) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig")) 
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
       geom_point(aes(col=sig)) + #add points colored by significance
       scale_color_manual(values=c("black", "red")) + 
       ggtitle("Volcanoplot DEG_Limma")+
       geom_text_repel(data=head(mutateddf, 10), aes(label=gene)) 
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg") 
volc

dev.off()
## png 
##   2
df<-result_monocytesBYo_full_DEG_Limma
df$gene<-rownames(result_monocytesBYo_full_DEG_Limma) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig"))
#show UP100 or  in bar plot
mutateddf_UP100 <-mutateddf[order(-mutateddf$logFC), ] #descenting order sort
mutateddf_UP100 <- mutateddf_UP100[1:100,] #top100 upregulated genes
  
#ggplot colors available http://sape.inf.usi.ch/quick-reference/ggplot2/colour
pdf(file="Step13_plot_barplot_result_adjPValUP100_DEG_Limma.pdf",height=5,width=10)
#Will have different colors depending on significance
bar1<-ggplot(mutateddf_UP100, aes(x=gene, y=logFC, width=.7))+
  geom_bar(position="stack", fill="red4", stat="identity")+
  labs(title=paste0("Barplot top genes with UP100 with significant adj.P.Val"))+
  theme(axis.text.x=element_text(angle=90,hjust=1))
  #labs(x="genes")+
  #labs(y="fold change")+
  #labs(caption="DEG_Limma")+
  #scale_fill_gradient(low="black",high="red")+
  #theme(plot.margin = margin(2,2,2,2, "cm"), plot.background = element_rect(fill = "white"))
print(bar1)
dev.off()
## png 
##   2
edata_keepDEG_Limma_UP100<- edata[which(rownames(edata) %in% mutateddf_UP100$gene), ]
dim(mutateddf_UP100)
## [1] 100  10
dim(edata_keepDEG_Limma_UP100)
## [1] 100  30
pdf(file="Step13_plot_heatmap_result_adjPValUP100_DEG_Limma.pdf",height=15,width=15)
  #mapdata<-edata[rownames(edata)==mutateddf$gene,]
  mapdata<-edata_keepDEG_Limma_UP100
  mapdata_matrix <- data.matrix(mapdata)
  colfunc<-colorRampPalette(c("red","green")) #because we are blue group! 
  hr<-hclust(as.dist(1-cor(t(mapdata_matrix),method="pearson")),method="complete")
  plotheatmap<-heatmap.2(mapdata_matrix,Rowv=as.dendrogram(hr),scale="row",density.info="none",trace="none",col=colfunc(256),cexRow=0.4,cexCol=0.4)
dev.off()
## png 
##   2

Step14: Keep only DEG_Limma genes from the edata

#genes which are sig differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_Limma)
## [1] 856   8
DEG_Limmanames=rownames(result_allContrasts_FC5.00_DEG_Limma)
head(DEG_Limmanames)
## [1] "Gpr141"  "Gm9733"  "Sirpb1c" "Plbd1"   "Sirpb1a" "Gm5150"
length(DEG_Limmanames)
## [1] 856
#save result_allContrasts_full_DEG_Limma, result_allContrasts_FC5.00_DEG_Limma, DEG_Limmanames
save(result_allContrasts_full_DEG_Limma, result_allContrasts_FC5.00_DEG_Limma, DEG_Limmanames, file="DEG_Limma_data.RData")
#Keep only those genes which are differentially expressed in contrasts
dim(edata)
## [1] 22123    30
edata_keepDEG_Limma<- edata[which(rownames(edata) %in% DEG_Limmanames), ]
dim(edata_keepDEG_Limma)
## [1] 856  30
#unique and common genes between the different contrasts DEG_Limma are retained
DT::datatable(edata_keepDEG_Limma[1:7,1:7])
#Export edata keepDEG_Limma genes only
write.csv(edata_keepDEG_Limma, "Step14_result_edata_keepDEG_Limma.csv")

Step14: Visualization of edata DEG_Limma filter. Note that unfiltered was done under pca_unscaled

SVA none and keepDEG_Limma

edata_keepDEG_Limma filtered to keepDEG_Limma genes

#Summary Statistics edata_keepDEG_Limma
DT::datatable(summary(edata_keepDEG_Limma[1:7,1:7]))
write.csv(summary(edata_keepDEG_Limma),"Step14_result_summary_stats_edata_keepDEG_Limma.csv")
pdf(file="Step14_plot_Visualization_boxplot_MDS_PC_keepDEG_Limma_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))

                            ############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]

                            ############boxplot chunk###############
#Before boxplot edata_keepDEG_Limma
par(mai=c(1,0.8,1,0.8))
boxplot(edata_keepDEG_Limma, outline=FALSE, las=2, cex=0.25, main="edata_keepDEG_Limma", col="yellow")

                            ############MDSplot chunk###############
#Before MDSplot edata_keepDEG_Limma
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_keepDEG_Limma, col=condition_colors, legend= "all", main="edata_keepDEG_Limma", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
                    #######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_keepDEG_Limma, colored by Study or batch, labels CellType
pca0=prcomp(t(edata_keepDEG_Limma), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2       PC3
## astro_SRR1033783 -10.334895 -9.683975 7.7065138
## astro_SRR1033784  -9.513882 -9.781029 8.5960540
## endo_SRR1033795  -10.059100 -5.815683 0.8765435
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_Limma",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$CellType,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                             #########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_keepDEG_Limma, colored by Study or batch, labels Species
pca0=prcomp(t(edata_keepDEG_Limma), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2       PC3
## astro_SRR1033783 -10.334895 -9.683975 7.7065138
## astro_SRR1033784  -9.513882 -9.781029 8.5960540
## endo_SRR1033795  -10.059100 -5.815683 0.8765435
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_Limma",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Species,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                                 #########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_keepDEG_Limma, colored by Study or batch, labels Age
pca0=prcomp(t(edata_keepDEG_Limma), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2       PC3
## astro_SRR1033783 -10.334895 -9.683975 7.7065138
## astro_SRR1033784  -9.513882 -9.781029 8.5960540
## endo_SRR1033795  -10.059100 -5.815683 0.8765435
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_Limma",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Age,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

dev.off()
## png 
##   2

Step14: Preparing aggregate data expr or edata_keepDEG_Limma from metadata by variables

t_edata_keepDEG_Limma<-t(edata_keepDEG_Limma)
DT::datatable(t_edata_keepDEG_Limma[,1:7])
DT::datatable(pheno)
dim(t_edata_keepDEG_Limma)
## [1]  30 856
dim(pheno)
## [1] 30  5
#bind the pheno and edata_keepDEG_Limma, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata_keepDEG_Limma<-cbind(pheno, t_edata_keepDEG_Limma)
dim(pheno_t_edata_keepDEG_Limma)
## [1]  30 861
DT::datatable(pheno_t_edata_keepDEG_Limma[,1:10])
write.csv(pheno_t_edata_keepDEG_Limma, "Step14_result_pheno_t_edata_keepDEG_Limma_All_Group_aggregate.csv")

Aggregade by CellType

#select the CellType pheno drop others
pheno_t_edata_keepDEG_Limma_CellType=pheno_t_edata_keepDEG_Limma[!names(pheno_t_edata_keepDEG_Limma) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_Limma_CellType)
## [1]  30 857
DT::datatable(pheno_t_edata_keepDEG_Limma_CellType[,1:10])
#aggregate edata_keepDEG_Limma or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Limma_CellType)
pheno_t_edata_keepDEG_Limma_CellType_avg=pheno_t_edata_keepDEG_Limma_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_keepDEG_Limma_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Limma_CellType_avg=t(pheno_t_edata_keepDEG_Limma_CellType_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Limma_CellType_avg)=pheno_t_edata_keepDEG_Limma_CellType_avg$CellType
dim(t_pheno_t_edata_keepDEG_Limma_CellType_avg)
## [1] 856   2
DT::datatable(t_pheno_t_edata_keepDEG_Limma_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Limma_CellType_avg, "Step14_result_edata_keepDEG_Limma_CellType_aggregate.csv")

Aggregade by Species

#select the Species pheno drop others
pheno_t_edata_keepDEG_Limma_Species=pheno_t_edata_keepDEG_Limma[!names(pheno_t_edata_keepDEG_Limma) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_keepDEG_Limma_Species)
## [1]  30 857
DT::datatable(pheno_t_edata_keepDEG_Limma_Species[,1:10])
#aggregate edata_keepDEG_Limma or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Limma_Species)
pheno_t_edata_keepDEG_Limma_Species_avg=pheno_t_edata_keepDEG_Limma_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_keepDEG_Limma_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Limma_Species_avg=t(pheno_t_edata_keepDEG_Limma_Species_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Limma_Species_avg)=pheno_t_edata_keepDEG_Limma_Species_avg$Species
dim(t_pheno_t_edata_keepDEG_Limma_Species_avg)
## [1] 856   2
DT::datatable(t_pheno_t_edata_keepDEG_Limma_Species_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Limma_Species_avg, "Step14_result_edata_keepDEG_Limma_Species_aggregate.csv")

Aggregade by Age

#select the Age pheno drop others
pheno_t_edata_keepDEG_Limma_Age=pheno_t_edata_keepDEG_Limma[!names(pheno_t_edata_keepDEG_Limma) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_Limma_Age)
## [1]  30 857
DT::datatable(pheno_t_edata_keepDEG_Limma_Age[,1:10])
#aggregate edata_keepDEG_Limma or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Limma_Age)
pheno_t_edata_keepDEG_Limma_Age_avg=pheno_t_edata_keepDEG_Limma_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_keepDEG_Limma_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Limma_Age_avg=t(pheno_t_edata_keepDEG_Limma_Age_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Limma_Age_avg)=pheno_t_edata_keepDEG_Limma_Age_avg$Age
dim(t_pheno_t_edata_keepDEG_Limma_Age_avg)
## [1] 856   2
DT::datatable(t_pheno_t_edata_keepDEG_Limma_Age_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Limma_Age_avg, "Step14_result_edata_keepDEG_Limma_Age_aggregate.csv")

Aggregade by Tissue

#select the Tissue pheno drop others
pheno_t_edata_keepDEG_Limma_Tissue=pheno_t_edata_keepDEG_Limma[!names(pheno_t_edata_keepDEG_Limma) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_keepDEG_Limma_Tissue)
## [1]  30 857
DT::datatable(pheno_t_edata_keepDEG_Limma_Tissue[,1:10])
#aggregate edata_keepDEG_Limma or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Limma_Tissue)
pheno_t_edata_keepDEG_Limma_Tissue_avg=pheno_t_edata_keepDEG_Limma_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_keepDEG_Limma_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Limma_Tissue_avg=t(pheno_t_edata_keepDEG_Limma_Tissue_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Limma_Tissue_avg)=pheno_t_edata_keepDEG_Limma_Tissue_avg$Tissue
dim(t_pheno_t_edata_keepDEG_Limma_Tissue_avg)
## [1] 856   4
DT::datatable(t_pheno_t_edata_keepDEG_Limma_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Limma_Tissue_avg, "Step14_result_edata_keepDEG_Limma_Tissue_aggregate.csv")

Aggregade all Groups

t_pheno_t_edata_keepDEG_Limma_all_avg<-cbind(t_pheno_t_edata_keepDEG_Limma_CellType_avg, t_pheno_t_edata_keepDEG_Limma_Species_avg, t_pheno_t_edata_keepDEG_Limma_Age_avg, t_pheno_t_edata_keepDEG_Limma_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_keepDEG_Limma_all_avg))
write.csv(t_pheno_t_edata_keepDEG_Limma_all_avg, "Step14_result_edata_keepDEG_Limma_All_Group_aggregate.csv")
#save pheno_t_edata_keepDEG_Limma, t_pheno_t_edata_keepDEG_Limma_CellType_avg, t_pheno_t_edata_keepDEG_Limma_Species_avg, t_pheno_t_edata_keepDEG_Limma_Age_avg, t_pheno_t_edata_keepDEG_Limma_Tissue_avg, t_pheno_t_edata_keepDEG_Limma_all_avg
save(pheno_t_edata_keepDEG_Limma, t_pheno_t_edata_keepDEG_Limma_CellType_avg, t_pheno_t_edata_keepDEG_Limma_Species_avg, t_pheno_t_edata_keepDEG_Limma_Age_avg, t_pheno_t_edata_keepDEG_Limma_Tissue_avg, t_pheno_t_edata_keepDEG_Limma_all_avg, file="edata_keepDEG_Limma_aggregate_data.RData")

Step14: Visualization by heatmap and correlation of aggregate data expr or edata_keepDEG_Limma from metadata by variables

pdf(file="Step14_plot_boxplot_corr_plot_edata_keepDEG_Limma_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))

                    #########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_keepDEG_Limma_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_CellType_avg", col="yellow")

#box plot on aggregate Species
boxplot(t_pheno_t_edata_keepDEG_Limma_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_Species_avg", col="yellow")

#box plot on aggregate Age
boxplot(t_pheno_t_edata_keepDEG_Limma_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_Age_avg", col="yellow")

#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_keepDEG_Limma_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_Tissue_avg", col="yellow")

#box plot on aggregate all
boxplot(t_pheno_t_edata_keepDEG_Limma_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Limma_all_avg", col="yellow")

                    #########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_CellType_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_Species_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_Age_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_Tissue_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Limma_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step14_result_edata_keepDEG_Limma_all_Group_corr_pval_aggregate.csv")

dev.off()
## png 
##   2
pdf(file="Step14_plot_density_edata_keepDEG_Limma_aka_expr_aggregate.pdf",height=10,width=10)

                    #########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_keepDEG_Limma_CellType_avg", y = "Density")


#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Species",x="t_pheno_t_edata_keepDEG_Limma_Species_avg", y = "Density")
  

#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Age",x="t_pheno_t_edata_keepDEG_Limma_Age_avg", y = "Density")

#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_keepDEG_Limma_Tissue_avg", y = "Density")

#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_keepDEG_Limma_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Limma_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for all",x="t_pheno_t_edata_keepDEG_Limma_all_avg", y = "Density") 

dev.off()
## png 
##   2

Clear up workspace and retain required dataframes

#save pheno, edata
save(pheno, metadata_1, edata, edata_keepDEG_Limma, file="edata_keepDEG_Limma_data.RData")
#save image
save.image(file="DEG_Limma_temp.RData")
rm(list=ls())
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 3991453 213.2    8073370 431.2  8073370 431.2
## Vcells 7483579  57.1   29333001 223.8 29332762 223.8

Step15: Differentially expressed genes DEG Simple division of arithmetic means of expression data so easier to interpret like cuffdiff (published CGGA code)

#Load pheno, edata required for steps below
load(file="done_pre_Step8.RData")
gc() 
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4191350 223.9    8073370 431.2  8073370 431.2
## Vcells 16868874 128.7   29333001 223.8 29332762 223.8
#load required libraries
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
library(pamr)
library(ggplot2) # plot
library(ggmap)#plots
library(gplots)#plots
library(ggrepel) #Avoid overlapping labels
library(ggmap)#plots
library(gplots)#plots
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
library(viridis)#for corelation plot
library(corrplot)#for corelation plot
library(dplyr)

Input data wil be converted from log2 space to antilog so that direct division of means will be performed for determining differentially expressed genes

colnames(edata)==rownames(pheno)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE
#This should be all TRUE 
#DT::datatable(edata[1:3,1:3])
edata[1:3,1:3]
##               astro_SRR1033783 astro_SRR1033784 endo_SRR1033795
## 0610009B22Rik         4.955948         4.845806       4.4090832
## 0610009E02Rik         1.459870         1.113146       0.3603799
## 0610009L18Rik         4.215835         3.674934       1.6072611
#From here identify column numbers of interest for next step
head(pheno)
##                        CellType      Age     Study         Species
## astro_SRR1033783          other  No_data  GSE52564 mouse_FVN_Swiss
## astro_SRR1033784          other  No_data  GSE52564 mouse_FVN_Swiss
## endo_SRR1033795           other  No_data  GSE52564 mouse_FVN_Swiss
## endo_SRR1033796           other  No_data  GSE52564 mouse_FVN_Swiss
## macrophages_SRR1106141    other 5_months SRP035309    mouse_C57BL6
## macrophages_SRR1106142    other 5_months SRP035309    mouse_C57BL6
##                                 Tissue
## astro_SRR1033783       cerebral_cortex
## astro_SRR1033784       cerebral_cortex
## endo_SRR1033795        cerebral_cortex
## endo_SRR1033796        cerebral_cortex
## macrophages_SRR1106141      peritoneum
## macrophages_SRR1106142      peritoneum

DEG_Simple Differentially expressed genes monocytesBYo

#Create index to subset expression data to find the differentially expressed genes .
index_O=which(pheno$CellType=='other')
index_AS=which(pheno$CellType=='monocytes')
#subset expression data to find the differentially expressed genes .
edata_monocytesBYo<-edata[,c(index_AS,index_O)]
head(edata_monocytesBYo)
##               monocytes_SRR5483448 monocytes_SRR5483449
## 0610009B22Rik            5.0429266            4.6151396
## 0610009E02Rik            0.8453395            0.8159736
## 0610009L18Rik            3.9059140            3.9546957
## 0610009O20Rik            2.1779053            2.2899918
## 0610010F05Rik            2.7444755            2.6264154
## 0610012G03Rik            7.2559842            7.2598248
##               monocytes_SRR5483450 monocytes_SRR5483451
## 0610009B22Rik             4.972788            4.2654510
## 0610009E02Rik             1.407900            0.5081903
## 0610009L18Rik             3.179912            2.9825106
## 0610009O20Rik             2.278137            2.1253128
## 0610010F05Rik             2.507282            2.8541869
## 0610012G03Rik             7.524382            6.7149669
##               monocytes_SRR5483452 astro_SRR1033783 astro_SRR1033784
## 0610009B22Rik             3.974523         4.955948         4.845806
## 0610009E02Rik             1.011250         1.459870         1.113146
## 0610009L18Rik             2.272594         4.215835         3.674934
## 0610009O20Rik             2.210888         6.310704         6.183138
## 0610010F05Rik             3.010300         2.772554         2.804534
## 0610012G03Rik             6.625560         6.950474         6.846606
##               endo_SRR1033795 endo_SRR1033796 macrophages_SRR1106141
## 0610009B22Rik       4.4090832        4.502667               5.602279
## 0610009E02Rik       0.3603799        0.243692               0.000000
## 0610009L18Rik       1.6072611        1.952993               2.906423
## 0610009O20Rik       4.8231150        4.848143               3.385650
## 0610010F05Rik       1.8905326        1.971668               1.206407
## 0610012G03Rik       6.9670158        7.052615               9.115554
##               macrophages_SRR1106142 macrophages_SRR1106143
## 0610009B22Rik              5.4082505              5.4189456
## 0610009E02Rik              0.3667327              0.0000000
## 0610009L18Rik              2.5190074              3.2384505
## 0610009O20Rik              3.8842368              2.8918854
## 0610010F05Rik              1.2885642              0.7934928
## 0610012G03Rik              8.9263649              9.2219219
##               macrophages_SRR1106169 macrophages_SRR1106186
## 0610009B22Rik               5.902053              5.4959670
## 0610009E02Rik               0.000000              0.3710340
## 0610009L18Rik               3.868483              3.1685405
## 0610009O20Rik               3.441345              3.3645188
## 0610010F05Rik               1.189148              0.8316789
## 0610012G03Rik               9.003139              8.8119698
##               macrophages_SRR1106188 microglia_SRR1033793
## 0610009B22Rik              5.6169046            2.0202852
## 0610009E02Rik              0.0000000            0.5107171
## 0610009L18Rik              2.7843654            3.7203000
## 0610009O20Rik              3.5556140            5.2641770
## 0610010F05Rik              0.9867346            0.3812806
## 0610012G03Rik              9.0534257            7.3788153
##               microglia_SRR1033794 microglia_SRR1106118
## 0610009B22Rik            2.6798393            5.6552688
## 0610009E02Rik            0.2378286            0.1054247
## 0610009L18Rik            3.0468366            2.5747633
## 0610009O20Rik            5.5836801            3.3889920
## 0610010F05Rik            0.3039537            0.2031030
## 0610012G03Rik            7.3411645            7.7680973
##               microglia_SRR1106122 microglia_SRR5483445
## 0610009B22Rik            5.8137871             6.581153
## 0610009E02Rik            0.0000000             1.046593
## 0610009L18Rik            3.5681973             4.507416
## 0610009O20Rik            3.5025661             3.645392
## 0610010F05Rik            0.3484909             2.591149
## 0610012G03Rik            8.0522739             7.182036
##               microglia_SRR5483446 microglia_SRR5483447
## 0610009B22Rik            6.7364702            6.6051291
## 0610009E02Rik            0.6805091            0.7372848
## 0610009L18Rik            4.7608643            4.8392993
## 0610009O20Rik            3.8413100            3.7905855
## 0610010F05Rik            2.4517537            2.6293385
## 0610012G03Rik            7.2595420            7.2184996
##               myeoligo_SRR1033791 myeoligo_SRR1033792 neuron_SRR1033785
## 0610009B22Rik          5.13165355           4.9628127          5.074496
## 0610009E02Rik          0.03926298           0.2194104          1.179335
## 0610009L18Rik          2.59376022           3.4391757          3.614653
## 0610009O20Rik          4.77915518           4.9189658          5.480809
## 0610010F05Rik          1.62141301           1.7297040          3.689563
## 0610012G03Rik          7.41307069           7.4875229          7.607235
##               neuron_SRR1033786 newoligo_SRR1033789 newoligo_SRR1033790
## 0610009B22Rik          5.031467           5.4321841           5.1954989
## 0610009E02Rik          1.716639           0.5480982           0.8470736
## 0610009L18Rik          3.510926           3.8301179           4.3732173
## 0610009O20Rik          5.352959           5.4502942           5.5312406
## 0610010F05Rik          4.010739           3.0207377           1.8599674
## 0610012G03Rik          7.540919           7.3616935           7.8424973
##               preoligo_SRR1033787 preoligo_SRR1033788
## 0610009B22Rik           5.1328847           4.9719015
## 0610009E02Rik           0.6893239           0.4045672
## 0610009L18Rik           3.8282169           4.5291014
## 0610009O20Rik           5.8388370           6.0930914
## 0610010F05Rik           2.8053688           1.6266376
## 0610012G03Rik           7.1233211           8.0684096
#Differentially expressed genes between "monocytesBYo"
t_monocytesBYo<-apply(edata_monocytesBYo,1,function(x){
  mod<-t.test(x[index_O],x[index_AS])
  meanx<-mean(2^x[index_O]-1) #because we had log2+1
  meany<-mean(2^x[index_AS]-1)#because we had log2+1
  tvalue<-mod[1]$statistic[[1]]
  FC<-meany/meanx
  logFC<-log2(FC)
  pvalue<-mod[3]$p.value
  Contrast<-"monocytesBYo"
  c(meanx=meanx,meany=meany,tvalue=tvalue,FC=FC,logFC=logFC, pvalue=pvalue, Contrast=Contrast)
})
t_monocytesBYo<-t(t_monocytesBYo)
adj.P.Val<-p.adjust(t_monocytesBYo[,'pvalue'], method="BH")
#DEG_Simple for contrast "monocytesBYo"
result_monocytesBYo_full_DEG_Simple<-cbind(t_monocytesBYo,adj.P.Val)
result_monocytesBYo_full_DEG_Simple<-as.data.frame(result_monocytesBYo_full_DEG_Simple)
cols = c("meanx","meany", "tvalue", "FC", "logFC", "pvalue", "adj.P.Val")  
result_monocytesBYo_full_DEG_Simple[,cols] = apply(result_monocytesBYo_full_DEG_Simple[,cols], 2, function(x) as.numeric(as.character(x)))
result_monocytesBYo_full_DEG_Simple = result_monocytesBYo_full_DEG_Simple[order(-result_monocytesBYo_full_DEG_Simple$logFC), ] #order by descending fold change

result_monocytesBYo_FC5.00_DEG_Simple<- result_monocytesBYo_full_DEG_Simple[which(result_monocytesBYo_full_DEG_Simple$FC >= 5.00), ]

dim(result_monocytesBYo_full_DEG_Simple)
## [1] 22123     8
dim(result_monocytesBYo_FC5.00_DEG_Simple)
## [1] 977   8
write.csv(result_monocytesBYo_full_DEG_Simple, "Step15_result_monocytesBYo_full_DEG_Simple.csv")
write.csv(result_monocytesBYo_FC5.00_DEG_Simple, "Step15_result_monocytesBYo_FC5.00_DEG_Simple.csv")

summary(result_monocytesBYo_full_DEG_Simple)
##      meanx              meany              tvalue        
##  Min.   :    0.00   Min.   :    0.00   Min.   :-18.5227  
##  1st Qu.:    0.62   1st Qu.:    0.13   1st Qu.: -0.5239  
##  Median :    3.44   Median :    1.24   Median :  0.7163  
##  Mean   :   45.15   Mean   :   45.16   Mean   :  0.8929  
##  3rd Qu.:   15.06   3rd Qu.:    9.63   3rd Qu.:  2.2504  
##  Max.   :31888.88   Max.   :65697.25   Max.   : 16.1071  
##        FC              logFC             pvalue       
##  Min.   :0.00000   Min.   :   -Inf   Min.   :0.00000  
##  1st Qu.:0.07578   1st Qu.:-3.7221   1st Qu.:0.02265  
##  Median :0.56959   Median :-0.8120   Median :0.18166  
##  Mean   :    Inf   Mean   :    NaN   Mean   :0.31158  
##  3rd Qu.:1.26243   3rd Qu.: 0.3362   3rd Qu.:0.56535  
##  Max.   :    Inf   Max.   :    Inf   Max.   :1.00000  
##          Contrast       adj.P.Val      
##  monocytesBYo:22123   Min.   :0.00000  
##                       1st Qu.:0.09058  
##                       Median :0.36331  
##                       Mean   :0.42953  
##                       3rd Qu.:0.75378  
##                       Max.   :1.00000
summary(result_monocytesBYo_FC5.00_DEG_Simple)
##      meanx               meany              tvalue        
##  Min.   :    0.000   Min.   :    0.31   Min.   :-18.5227  
##  1st Qu.:    0.138   1st Qu.:    1.86   1st Qu.: -3.0589  
##  Median :    0.464   Median :    5.79   Median : -2.1137  
##  Mean   :   37.624   Mean   :  345.75   Mean   : -2.8363  
##  3rd Qu.:    3.233   3rd Qu.:   35.97   3rd Qu.: -1.5906  
##  Max.   :12011.697   Max.   :65697.25   Max.   : -0.4858  
##        FC             logFC           pvalue                Contrast  
##  Min.   : 5.000   Min.   :2.322   Min.   :0.00000   monocytesBYo:977  
##  1st Qu.: 6.438   1st Qu.:2.687   1st Qu.:0.03180                     
##  Median : 8.724   Median :3.125   Median :0.09803                     
##  Mean   :   Inf   Mean   :  Inf   Mean   :0.13293                     
##  3rd Qu.:17.375   3rd Qu.:4.119   3rd Qu.:0.18229                     
##  Max.   :   Inf   Max.   :  Inf   Max.   :0.65005                     
##    adj.P.Val     
##  Min.   :0.0000  
##  1st Qu.:0.1158  
##  Median :0.2499  
##  Mean   :0.2659  
##  3rd Qu.:0.3641  
##  Max.   :0.8124
#Bind all differentally expressed full genes into a single table
result_allContrasts_full_DEG_Simple<-rbind(result_monocytesBYo_full_DEG_Simple) #here only one table so rbind not required
dim(result_allContrasts_full_DEG_Simple)
## [1] 22123     8
write.csv(result_allContrasts_full_DEG_Simple, "Step15_result_allContrasts_full_DEG_Simple.csv")

#Bind all differentally expressed pval genes into a single table
result_allContrasts_FC5.00_DEG_Simple<-rbind(result_monocytesBYo_FC5.00_DEG_Simple)#here only one table so rbind not required
dim(result_allContrasts_FC5.00_DEG_Simple)
## [1] 977   8
write.csv(result_allContrasts_FC5.00_DEG_Simple, "Step15_result_allContrasts_FC5.00_DEG_Simple.csv")

Step15: Visualization of differential gene expression results from DEG_Simple

#https://www.biostars.org/p/203876/
pdf(file="Step15_plot_Volcano_result_adjFC5.00_DEG_Simple.pdf",height=10,width=10)
#par(mfrow=c(2,2))

df<-result_monocytesBYo_full_DEG_Simple
df$gene<-rownames(result_monocytesBYo_full_DEG_Simple) #create column for genes
mutateddf <- mutate(df, sig=ifelse(df$adj.P.Val<0.05, "adj.P.Val<0.05", "Not Sig")) 
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
       geom_point(aes(col=sig)) + #add points colored by significance
       scale_color_manual(values=c("red", "black")) + 
       ggtitle("Volcanoplot DEG_Simple")+
       geom_text_repel(data=head(mutateddf, 10), aes(label=gene)) 
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg") 
volc

dev.off()
## png 
##   2
#https://www.biostars.org/p/203876/
pdf(file="Step15_plot_Volcano_result_adjPValFC_DEG_Simple.pdf",height=10,width=10)
#par(mfrow=c(2,2))

df<-result_monocytesBYo_full_DEG_Simple
df$gene<-rownames(result_monocytesBYo_full_DEG_Simple) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig")) 
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
       geom_point(aes(col=sig)) + #add points colored by significance
       scale_color_manual(values=c("black", "red")) + 
       ggtitle("Volcanoplot DEG_Simple")+
       geom_text_repel(data=head(mutateddf, 10), aes(label=gene)) 
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg") 
volc

dev.off()
## png 
##   2
df<-result_monocytesBYo_full_DEG_Simple
df$gene<-rownames(result_monocytesBYo_full_DEG_Simple) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig"))
#show UP100 or  in bar plot
mutateddf_UP100 <-mutateddf[order(-mutateddf$logFC), ] #descenting order sort
mutateddf_UP100 <- mutateddf_UP100[1:100,] #top100 upregulated genes
  
#ggplot colors available http://sape.inf.usi.ch/quick-reference/ggplot2/colour
pdf(file="Step15_plot_barplot_result_adjPValUP100_DEG_Simple.pdf",height=5,width=10)
#Will have different colors depending on significance
bar1<-ggplot(mutateddf_UP100, aes(x=gene, y=logFC, width=.7))+
  geom_bar(position="stack", fill="red4", stat="identity")+
  labs(title=paste0("Barplot top genes with UP100 with significant adj.P.Val"))+
  theme(axis.text.x=element_text(angle=90,hjust=1))
  #labs(x="genes")+
  #labs(y="fold change")+
  #labs(caption="DEG_Simple")+
  #scale_fill_gradient(low="black",high="red")+
  #theme(plot.margin = margin(2,2,2,2, "cm"), plot.background = element_rect(fill = "white"))
print(bar1)
dev.off()
## png 
##   2
edata_keepDEG_Simple_UP100<- edata[which(rownames(edata) %in% mutateddf_UP100$gene), ]
dim(mutateddf_UP100)
## [1] 100  10
dim(edata_keepDEG_Simple_UP100)
## [1] 100  30
pdf(file="Step15_plot_heatmap_result_adjPValUP100_DEG_Simple.pdf",height=15,width=15)
  #mapdata<-edata[rownames(edata)==mutateddf$gene,]
  mapdata<-edata_keepDEG_Simple_UP100
  mapdata_matrix <- data.matrix(mapdata)
  colfunc<-colorRampPalette(c("red","green")) #because we are blue group! 
  hr<-hclust(as.dist(1-cor(t(mapdata_matrix),method="pearson")),method="complete")
  plotheatmap<-heatmap.2(mapdata_matrix,Rowv=as.dendrogram(hr),scale="row",density.info="none",trace="none",col=colfunc(256),cexRow=0.4,cexCol=0.4)
dev.off()
## png 
##   2

Step16: Keep only DEG_Simple genes from the edata

#genes which are sig differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_Simple)
## [1] 977   8
DEG_Simplenames=rownames(result_allContrasts_FC5.00_DEG_Simple)
head(DEG_Simplenames)
## [1] "Gm13272" "Gm17806" "Gm18701" "Gm18921" "Gm20594" "Gm20878"
length(DEG_Simplenames)
## [1] 977
#save result_allContrasts_full_DEG_Simple, result_allContrasts_FC5.00_DEG_Simple, DEG_Simplenames
save(result_allContrasts_full_DEG_Simple, result_allContrasts_FC5.00_DEG_Simple, DEG_Simplenames, file="DEG_Simple_data.RData")
#Keep only those genes which are differentially expressed in contrasts
dim(edata)
## [1] 22123    30
edata_keepDEG_Simple<- edata[which(rownames(edata) %in% DEG_Simplenames), ]
dim(edata_keepDEG_Simple)
## [1] 977  30
#unique and common genes between the different contrasts DEG_Simple are retained
DT::datatable(edata_keepDEG_Simple[1:7,1:7])
#Export edata keepDEG_Simple genes only
write.csv(edata_keepDEG_Simple, "Step16_result_edata_keepDEG_Simple.csv")

Step16: Visualization of edata DEG_Simple filter. Note that unfiltered was done under pca_unscaled

SVA none and keepDEG_Simple

edata_keepDEG_Simple filtered to keepDEG_Simple genes

#Summary Statistics edata_keepDEG_Simple
DT::datatable(summary(edata_keepDEG_Simple[1:7,1:7]))
write.csv(summary(edata_keepDEG_Simple),"Step16_result_summary_stats_edata_keepDEG_Simple.csv")
pdf(file="Step16_plot_Visualization_boxplot_MDS_PC_keepDEG_Simple_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))

                            ############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]

                            ############boxplot chunk###############
#Before boxplot edata_keepDEG_Simple
par(mai=c(1,0.8,1,0.8))
boxplot(edata_keepDEG_Simple, outline=FALSE, las=2, cex=0.25, main="edata_keepDEG_Simple", col="yellow")

                            ############MDSplot chunk###############
#Before MDSplot edata_keepDEG_Simple
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_keepDEG_Simple, col=condition_colors, legend= "all", main="edata_keepDEG_Simple", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
                    #######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_keepDEG_Simple, colored by Study or batch, labels CellType
pca0=prcomp(t(edata_keepDEG_Simple), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2      PC3
## astro_SRR1033783  -7.997409 -8.062666 5.558321
## astro_SRR1033784  -9.062199 -7.426955 5.687962
## endo_SRR1033795  -12.495851 -5.830971 4.031279
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_Simple",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$CellType,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                             #########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_keepDEG_Simple, colored by Study or batch, labels Species
pca0=prcomp(t(edata_keepDEG_Simple), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2      PC3
## astro_SRR1033783  -7.997409 -8.062666 5.558321
## astro_SRR1033784  -9.062199 -7.426955 5.687962
## endo_SRR1033795  -12.495851 -5.830971 4.031279
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_Simple",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Species,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                                 #########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_keepDEG_Simple, colored by Study or batch, labels Age
pca0=prcomp(t(edata_keepDEG_Simple), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2      PC3
## astro_SRR1033783  -7.997409 -8.062666 5.558321
## astro_SRR1033784  -9.062199 -7.426955 5.687962
## endo_SRR1033795  -12.495851 -5.830971 4.031279
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_Simple",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Age,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

dev.off()
## png 
##   2

Step16: Preparing aggregate data expr or edata_keepDEG_Simple from metadata by variables

t_edata_keepDEG_Simple<-t(edata_keepDEG_Simple)
DT::datatable(t_edata_keepDEG_Simple[,1:7])
DT::datatable(pheno)
dim(t_edata_keepDEG_Simple)
## [1]  30 977
dim(pheno)
## [1] 30  5
#bind the pheno and edata_keepDEG_Simple, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata_keepDEG_Simple<-cbind(pheno, t_edata_keepDEG_Simple)
dim(pheno_t_edata_keepDEG_Simple)
## [1]  30 982
DT::datatable(pheno_t_edata_keepDEG_Simple[,1:10])
write.csv(pheno_t_edata_keepDEG_Simple, "Step16_result_pheno_t_edata_keepDEG_Simple_All_Group_aggregate.csv")

Aggregade by CellType

#select the CellType pheno drop others
pheno_t_edata_keepDEG_Simple_CellType=pheno_t_edata_keepDEG_Simple[!names(pheno_t_edata_keepDEG_Simple) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_Simple_CellType)
## [1]  30 978
DT::datatable(pheno_t_edata_keepDEG_Simple_CellType[,1:10])
#aggregate edata_keepDEG_Simple or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Simple_CellType)
pheno_t_edata_keepDEG_Simple_CellType_avg=pheno_t_edata_keepDEG_Simple_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_keepDEG_Simple_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Simple_CellType_avg=t(pheno_t_edata_keepDEG_Simple_CellType_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Simple_CellType_avg)=pheno_t_edata_keepDEG_Simple_CellType_avg$CellType
dim(t_pheno_t_edata_keepDEG_Simple_CellType_avg)
## [1] 977   2
DT::datatable(t_pheno_t_edata_keepDEG_Simple_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Simple_CellType_avg, "Step16_result_edata_keepDEG_Simple_CellType_aggregate.csv")

Aggregade by Species

#select the Species pheno drop others
pheno_t_edata_keepDEG_Simple_Species=pheno_t_edata_keepDEG_Simple[!names(pheno_t_edata_keepDEG_Simple) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_keepDEG_Simple_Species)
## [1]  30 978
DT::datatable(pheno_t_edata_keepDEG_Simple_Species[,1:10])
#aggregate edata_keepDEG_Simple or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Simple_Species)
pheno_t_edata_keepDEG_Simple_Species_avg=pheno_t_edata_keepDEG_Simple_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_keepDEG_Simple_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Simple_Species_avg=t(pheno_t_edata_keepDEG_Simple_Species_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Simple_Species_avg)=pheno_t_edata_keepDEG_Simple_Species_avg$Species
dim(t_pheno_t_edata_keepDEG_Simple_Species_avg)
## [1] 977   2
DT::datatable(t_pheno_t_edata_keepDEG_Simple_Species_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Simple_Species_avg, "Step16_result_edata_keepDEG_Simple_Species_aggregate.csv")

Aggregade by Age

#select the Age pheno drop others
pheno_t_edata_keepDEG_Simple_Age=pheno_t_edata_keepDEG_Simple[!names(pheno_t_edata_keepDEG_Simple) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_Simple_Age)
## [1]  30 978
DT::datatable(pheno_t_edata_keepDEG_Simple_Age[,1:10])
#aggregate edata_keepDEG_Simple or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Simple_Age)
pheno_t_edata_keepDEG_Simple_Age_avg=pheno_t_edata_keepDEG_Simple_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_keepDEG_Simple_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Simple_Age_avg=t(pheno_t_edata_keepDEG_Simple_Age_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Simple_Age_avg)=pheno_t_edata_keepDEG_Simple_Age_avg$Age
dim(t_pheno_t_edata_keepDEG_Simple_Age_avg)
## [1] 977   2
DT::datatable(t_pheno_t_edata_keepDEG_Simple_Age_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Simple_Age_avg, "Step16_result_edata_keepDEG_Simple_Age_aggregate.csv")

Aggregade by Tissue

#select the Tissue pheno drop others
pheno_t_edata_keepDEG_Simple_Tissue=pheno_t_edata_keepDEG_Simple[!names(pheno_t_edata_keepDEG_Simple) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_keepDEG_Simple_Tissue)
## [1]  30 978
DT::datatable(pheno_t_edata_keepDEG_Simple_Tissue[,1:10])
#aggregate edata_keepDEG_Simple or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_Simple_Tissue)
pheno_t_edata_keepDEG_Simple_Tissue_avg=pheno_t_edata_keepDEG_Simple_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_keepDEG_Simple_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_Simple_Tissue_avg=t(pheno_t_edata_keepDEG_Simple_Tissue_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_Simple_Tissue_avg)=pheno_t_edata_keepDEG_Simple_Tissue_avg$Tissue
dim(t_pheno_t_edata_keepDEG_Simple_Tissue_avg)
## [1] 977   4
DT::datatable(t_pheno_t_edata_keepDEG_Simple_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_Simple_Tissue_avg, "Step16_result_edata_keepDEG_Simple_Tissue_aggregate.csv")

Aggregade all Groups

t_pheno_t_edata_keepDEG_Simple_all_avg<-cbind(t_pheno_t_edata_keepDEG_Simple_CellType_avg, t_pheno_t_edata_keepDEG_Simple_Species_avg, t_pheno_t_edata_keepDEG_Simple_Age_avg, t_pheno_t_edata_keepDEG_Simple_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_keepDEG_Simple_all_avg))
write.csv(t_pheno_t_edata_keepDEG_Simple_all_avg, "Step16_result_edata_keepDEG_Simple_All_Group_aggregate.csv")
#save pheno_t_edata_keepDEG_Simple, t_pheno_t_edata_keepDEG_Simple_CellType_avg, t_pheno_t_edata_keepDEG_Simple_Species_avg, t_pheno_t_edata_keepDEG_Simple_Age_avg, t_pheno_t_edata_keepDEG_Simple_Tissue_avg, t_pheno_t_edata_keepDEG_Simple_all_avg
save(pheno_t_edata_keepDEG_Simple, t_pheno_t_edata_keepDEG_Simple_CellType_avg, t_pheno_t_edata_keepDEG_Simple_Species_avg, t_pheno_t_edata_keepDEG_Simple_Age_avg, t_pheno_t_edata_keepDEG_Simple_Tissue_avg, t_pheno_t_edata_keepDEG_Simple_all_avg, file="edata_keepDEG_Simple_aggregate_data.RData")

Step16: Visualization by heatmap and correlation of aggregate data expr or edata_keepDEG_Simple from metadata by variables

pdf(file="Step16_plot_boxplot_corr_plot_edata_keepDEG_Simple_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))

                    #########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_keepDEG_Simple_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_CellType_avg", col="yellow")

#box plot on aggregate Species
boxplot(t_pheno_t_edata_keepDEG_Simple_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_Species_avg", col="yellow")

#box plot on aggregate Age
boxplot(t_pheno_t_edata_keepDEG_Simple_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_Age_avg", col="yellow")

#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_keepDEG_Simple_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_Tissue_avg", col="yellow")

#box plot on aggregate all
boxplot(t_pheno_t_edata_keepDEG_Simple_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_Simple_all_avg", col="yellow")

                    #########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_CellType_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_Species_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_Age_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_Tissue_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_Simple_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step16_result_edata_keepDEG_Simple_all_Group_corr_pval_aggregate.csv")

dev.off()
## png 
##   2
pdf(file="Step16_plot_density_edata_keepDEG_Simple_aka_expr_aggregate.pdf",height=10,width=10)

                    #########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_keepDEG_Simple_CellType_avg", y = "Density")


#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Species",x="t_pheno_t_edata_keepDEG_Simple_Species_avg", y = "Density")
  

#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Age",x="t_pheno_t_edata_keepDEG_Simple_Age_avg", y = "Density")

#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_keepDEG_Simple_Tissue_avg", y = "Density")

#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_keepDEG_Simple_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_Simple_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for all",x="t_pheno_t_edata_keepDEG_Simple_all_avg", y = "Density") 

dev.off()
## png 
##   2

Clear up workspace and retain required dataframes

#save pheno, edata 
save(pheno, metadata_1, edata, edata_keepDEG_Simple, file="edata_keepDEG_Simple_data.RData")
#save image
save.image(file="DEG_Simple_temp.RData")
rm(list=ls())
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 3997925 213.6    8073370 431.2  8073370 431.2
## Vcells 7510335  57.3   29333001 223.8 29333000 223.8

Step17: Differentially expressed genes DEG_edgeR method

#Load pheno, edata and edata_combatY required for steps below
load(file="done_pre_Step8.RData")
gc() 
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4197797 224.2    8073370 431.2  8073370 431.2
## Vcells 16900243 129.0   29333001 223.8 29333000 223.8
#load required libraries
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("sva","limma","bladderbatch","pamr"))
library(limma)
library(sva)
library(pamr)
library(ggplot2) # plot
library(ggrepel) #Avoid overlapping labels
library(ggmap)#plots
library(gplots)#plots
library(RColorBrewer)#color pallet
library(Hmisc)#for corelation plot
library(viridis)#for corelation plot
library(corrplot)#for corelation plot
library(dplyr)
library(edgeR) # for RNAseq DE testing 

Input data

#From here identify column numbers of interest for next step
colnames(pheno)
## [1] "CellType" "Age"      "Study"    "Species"  "Tissue"
#edgeR DGElist
#Note here we use the generic DEG_edgeR differential expression calculation step
#mod only includes the variables of interest and covariants, not surrogate variables
#https://bioconductor.org/packages/release/bioc/manuals/edgeR/man/edgeR.pdf
#https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
#Creating a DGEList object: a container for the counts, metadata - these include, for example, sample names, gene names and normalisation factors once these are computed. The DGEList is an example of the custom task-specific structures that are frequently used in Bioconductor to make analyses easier.

#counts must be positive finite values so we take antilog of the edata which is log2 space

counts=(2^edata-1)#because we had log2+1
length(which(apply(counts, 1, function(row) any(row <0)))) #number of negative values
## [1] 0
#option1 to replace negative values with column mean
counts[counts<0] <-NA #replace negative values with NA
if_na(counts)=t(mean_col(counts)) #replace NA with mean of column or sample

#option2 to remove negative value containing rows
#counts[counts<0] <-NA #replace negative values with NA
#counts<-counts[!is.na(counts)]

dgList <- DGEList(counts=counts, genes=rownames(edata))  
#dgList <- DGEList(counts=b, genes=rownames(b))
dim(dgList)#lib.size is calculated by default
## [1] 22123    30
dgList[1:3,1:3]
## An object of class "DGEList"
## $counts
##               astro_SRR1033783 astro_SRR1033784 endo_SRR1033795
## 0610009B22Rik        30.037654        27.756307      20.2454671
## 0610009E02Rik         1.750835         1.163168       0.2837639
## 0610009L18Rik        17.582018        11.772189       2.0467289
## 
## $samples
##                  group lib.size norm.factors
## astro_SRR1033783     1 999373.1            1
## astro_SRR1033784     1 999271.4            1
## endo_SRR1033795      1 999719.2            1
## 
## $genes
##                       genes
## 0610009B22Rik 0610009B22Rik
## 0610009E02Rik 0610009E02Rik
## 0610009L18Rik 0610009L18Rik
#edgeR: Normalization calcNormFactors
#Normalization:  important to normalise RNA-seq both within and between samples. edgeR has trimmed mean of M-values (TMM) method, and other options also available. TMM normalization lessens the impact of reads falling into a small set of genes
#since our edata is already loOtpm normalized we skip thi step, if using raw counts like htseq2Counts then use this
#dgList <- calcNormFactors(dgList, method="TMM")
#dgList[1:3,1:3] #new lib.size re-calculated is now added here
#lib.size norm factor <1 indicates small number of high counts are monopolizing seq data causing other genes to be lower, so library size is scaled down. Conversely, for >1 library size is scaled up analogous to downscaling the counts. 
#edgeR: create model matrix
#Contrasts can be applied only to factors with 2 or more levels.

mod_DEG_edgeR = model.matrix(~0+CellType+Study, data=pheno)

#coefficients available for edgeR differential gene expression
DT::datatable(mod_DEG_edgeR)
#Coefficients if not estimable i.e. gives NAs in coefficient then remove. Try to use the max number of variables
#mod_DEG_edgeR = model.matrix(~0+CellType+Study, data=pheno) #Tissue and Study is same so removed Tissue
#mod_DEG_edgeR = model.matrix(~0+CellType+Study+Species+Tissue+Age, data=pheno) 
#mod_DEG_edgeR = model.matrix(~0+CellType+Species+Age, data=pheno)
#edgeR: estimate dispersion use model matrix
dgList <- estimateGLMCommonDisp(dgList, design=mod_DEG_edgeR)
dgList <- estimateGLMTrendedDisp(dgList, design=mod_DEG_edgeR)
#Note that either the common or trended dispersion needs to be estimated before we can estimate the tagwise dispersion.
dgList <- estimateGLMTagwiseDisp(dgList, design=mod_DEG_edgeR)
#edgeR model fitting
fit_DEG_edgeR <- glmFit(dgList, mod_DEG_edgeR)
summary(fit_DEG_edgeR)
##                       Length Class            Mode     
## coefficients           88492 -none-           numeric  
## fitted.values         663690 -none-           numeric  
## deviance               22123 -none-           numeric  
## method                     1 -none-           character
## counts                663690 -none-           numeric  
## unshrunk.coefficients  88492 -none-           numeric  
## df.residual            22123 -none-           numeric  
## design                   120 -none-           numeric  
## offset                663690 CompressedMatrix numeric  
## dispersion             22123 -none-           numeric  
## prior.count                1 -none-           numeric  
## samples                    3 data.frame       list     
## genes                      1 data.frame       list     
## prior.df                   1 -none-           numeric  
## AveLogCPM              22123 -none-           numeric
DT::datatable(fit_DEG_edgeR$coefficients)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
#Use the values in the creating_CellType_contrasts as the first three values of contrast matrix for CellType 
design_DEG_edgeR<-model.matrix(~0+CellType, data=pheno)
colnames(design_DEG_edgeR)
## [1] "CellTypemonocytes" "CellTypeother"
design_DEG_edgeR[1:3,]
##                  CellTypemonocytes CellTypeother
## astro_SRR1033783                 0             1
## astro_SRR1033784                 0             1
## endo_SRR1033795                  0             1
DT::datatable(design_DEG_edgeR)
#comparisons you want write here
creating_CellType_contrasts_DEG_edgeR<-makeContrasts(monocytesBYo=CellTypemonocytes-CellTypeother, levels =design_DEG_edgeR)
creating_CellType_contrasts_DEG_edgeR
##                    Contrasts
## Levels              monocytesBYo
##   CellTypemonocytes            1
##   CellTypeother               -1
#Create contrasts to find the differentially expressed genes .
#We hold other variables and surrogate variables at 0. 
#Here the net number of rows is same and correspond to columns in fit_DEG_edgeR$coefficients
contrast.matrix_DEG_edgeR <- cbind("monocytesBYo"=c(1,-1,rep(0,2)))
contrast.matrix_DEG_edgeR
##      monocytesBYo
## [1,]            1
## [2,]           -1
## [3,]            0
## [4,]            0
#Identification of differentially expressed genes for the contrasts
#https://rstudio-pubs-static.s3.amazonaws.com/79395_b07ae39ce8124a5c873bd46d6075c137.html
#https://support.bioconductor.org/p/65751/
#https://support.bioconductor.org/p/47643/
lrt_monocytesBYo_DEG_edgeR <- glmLRT(fit_DEG_edgeR, contrast=contrast.matrix_DEG_edgeR[,"monocytesBYo"])
#DEG_edgeR for contrast "monocytesBYo"=CellTypemonocytes-CellTypeother 
result_monocytesBYo_full_DEG_edgeR<- topTags(lrt_monocytesBYo_DEG_edgeR, n = nrow(edata), adjust.method="BH")$table
names(result_monocytesBYo_full_DEG_edgeR)[names(result_monocytesBYo_full_DEG_edgeR) == 'FDR'] <- 'adj.P.Val'
result_monocytesBYo_full_DEG_edgeR$Contrast<-rep("monocytesBYo",nrow(result_monocytesBYo_full_DEG_edgeR))
result_monocytesBYo_full_DEG_edgeR$FC<- 2^(result_monocytesBYo_full_DEG_edgeR$logFC)
#drop extra genes column
result_monocytesBYo_full_DEG_edgeR <- result_monocytesBYo_full_DEG_edgeR[!names(result_monocytesBYo_full_DEG_edgeR) %in% c("genes")]
result_monocytesBYo_FC5.00_DEG_edgeR<- result_monocytesBYo_full_DEG_edgeR[which(result_monocytesBYo_full_DEG_edgeR$FC >= 5.00), ]

dim(result_monocytesBYo_full_DEG_edgeR)
## [1] 22123     7
dim(result_monocytesBYo_FC5.00_DEG_edgeR)
## [1] 3538    7
write.csv(result_monocytesBYo_full_DEG_edgeR, "Step17_result_monocytesBYo_full_DEG_edgeR.csv")
write.csv(result_monocytesBYo_FC5.00_DEG_edgeR, "Step17_result_monocytesBYo_FC5.00_DEG_edgeR.csv")

summary(result_monocytesBYo_full_DEG_edgeR)
##      logFC              logCPM             LR              PValue       
##  Min.   :-10.1683   Min.   : 1.052   Min.   :  0.000   Min.   :0.00000  
##  1st Qu.: -0.6889   1st Qu.: 1.391   1st Qu.:  0.265   1st Qu.:0.05486  
##  Median :  0.2351   Median : 2.443   Median :  1.251   Median :0.26331  
##  Mean   :  0.4713   Mean   : 3.015   Mean   :  3.533   Mean   :0.35270  
##  3rd Qu.:  1.5283   3rd Qu.: 4.096   3rd Qu.:  3.686   3rd Qu.:0.60671  
##  Max.   : 15.6919   Max.   :14.710   Max.   :100.093   Max.   :1.00000  
##    adj.P.Val        Contrast               FC          
##  Min.   :0.0000   Length:22123       Min.   :    0.00  
##  1st Qu.:0.2194   Class :character   1st Qu.:    0.62  
##  Median :0.5266   Mode  :character   Median :    1.18  
##  Mean   :0.5117                      Mean   :   13.03  
##  3rd Qu.:0.8089                      3rd Qu.:    2.88  
##  Max.   :1.0000                      Max.   :52934.67
summary(result_monocytesBYo_FC5.00_DEG_edgeR)
##      logFC            logCPM             LR              PValue        
##  Min.   : 2.322   Min.   : 1.061   Min.   : 0.8527   Min.   :0.000000  
##  1st Qu.: 2.818   1st Qu.: 1.187   1st Qu.: 2.9195   1st Qu.:0.001724  
##  Median : 3.414   Median : 1.450   Median : 4.4321   Median :0.035269  
##  Mean   : 3.891   Mean   : 2.288   Mean   : 8.5419   Mean   :0.053914  
##  3rd Qu.: 4.465   3rd Qu.: 2.636   3rd Qu.: 9.8227   3rd Qu.:0.087513  
##  Max.   :15.692   Max.   :14.710   Max.   :79.7524   Max.   :0.355785  
##    adj.P.Val         Contrast               FC          
##  Min.   :0.00000   Length:3538        Min.   :    5.00  
##  1st Qu.:0.01891   Class :character   1st Qu.:    7.05  
##  Median :0.16701   Mode  :character   Median :   10.66  
##  Mean   :0.17139                      Mean   :   74.48  
##  3rd Qu.:0.28541                      3rd Qu.:   22.09  
##  Max.   :0.62172                      Max.   :52934.67
#Bind all differentally expressed full genes into a single table
result_allContrasts_full_DEG_edgeR<-rbind(result_monocytesBYo_full_DEG_edgeR)#here only one table so rbind not required
dim(result_allContrasts_full_DEG_edgeR)
## [1] 22123     7
write.csv(result_allContrasts_full_DEG_edgeR, "Step17_result_allContrasts_full_DEG_edgeR.csv")

#Bind all differentally expressed pval genes into a single table
result_allContrasts_FC5.00_DEG_edgeR<-rbind(result_monocytesBYo_FC5.00_DEG_edgeR)#here only one table so rbind not required
dim(result_allContrasts_FC5.00_DEG_edgeR)
## [1] 3538    7
write.csv(result_allContrasts_FC5.00_DEG_edgeR, "Step17_result_allContrasts_FC5.00_DEG_edgeR.csv")

Step17: Visualization of differential gene expression results from DEG_edgeR

#https://www.biostars.org/p/203876/
pdf(file="Step17_plot_Volcano_result_adjFC5.00_DEG_edgeR.pdf",height=10,width=10)
#par(mfrow=c(2,2))

df<-result_monocytesBYo_full_DEG_edgeR
df$gene<-rownames(result_monocytesBYo_full_DEG_edgeR) #create column for genes
mutateddf <- mutate(df, sig=ifelse(df$adj.P.Val<0.05, "adj.P.Val<0.05", "Not Sig")) 
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
       geom_point(aes(col=sig)) + #add points colored by significance
       scale_color_manual(values=c("red", "black")) + 
       ggtitle("Volcanoplot DEG_edgeR")+
       geom_text_repel(data=head(mutateddf, 10), aes(label=gene)) 
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg") 
volc

dev.off()
## png 
##   2
#https://www.biostars.org/p/203876/
pdf(file="Step17_plot_Volcano_result_adjPValFC_DEG_edgeR.pdf",height=10,width=10)
#par(mfrow=c(2,2))

df<-result_monocytesBYo_full_DEG_edgeR
df$gene<-rownames(result_monocytesBYo_full_DEG_edgeR) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig")) 
#Will have different colors depending on significance
#volcanoplot with logFC versus adj.P.Val
volc = ggplot(mutateddf, aes(logFC, -log10(adj.P.Val))) +
       geom_point(aes(col=sig)) + #add points colored by significance
       scale_color_manual(values=c("black", "red")) + 
       ggtitle("Volcanoplot DEG_edgeR")+
       geom_text_repel(data=head(mutateddf, 10), aes(label=gene)) 
#adding text for the top 20 genes
#ggsave("Volcanoplot.jpeg", device="jpeg") 
volc

dev.off()
## png 
##   2
df<-result_monocytesBYo_full_DEG_edgeR
df$gene<-rownames(result_monocytesBYo_full_DEG_edgeR) #create column for genes
mutateddf <- mutate(df, sig=ifelse((df$adj.P.Val < 0.05 & abs(df$logFC) > 2.32), "Sig adj.P.Val&FC", "Not Sig"))
#show UP100 or  in bar plot
mutateddf_UP100 <-mutateddf[order(-mutateddf$logFC), ] #descenting order sort
mutateddf_UP100 <- mutateddf_UP100[1:100,] #top100 upregulated genes
  
#ggplot colors available http://sape.inf.usi.ch/quick-reference/ggplot2/colour
pdf(file="Step17_plot_barplot_result_adjPValUP100_DEG_edgeR.pdf",height=5,width=10)
#Will have different colors depending on significance
bar1<-ggplot(mutateddf_UP100, aes(x=gene, y=logFC, width=.7))+
  geom_bar(position="stack", fill="red4", stat="identity")+
  labs(title=paste0("Barplot top genes with UP100 with significant adj.P.Val"))+
  theme(axis.text.x=element_text(angle=90,hjust=1))
  #labs(x="genes")+
  #labs(y="fold change")+
  #labs(caption="DEG_edgeR")+
  #scale_fill_gradient(low="black",high="red")+
  #theme(plot.margin = margin(2,2,2,2, "cm"), plot.background = element_rect(fill = "white"))
print(bar1)
dev.off()
## png 
##   2
edata_keepDEG_edgeR_UP100<- edata[which(rownames(edata) %in% mutateddf_UP100$gene), ]
dim(mutateddf_UP100)
## [1] 100   9
dim(edata_keepDEG_edgeR_UP100)
## [1] 100  30
pdf(file="Step17_plot_heatmap_result_adjPValUP100_DEG_edgeR.pdf",height=15,width=15)
  #mapdata<-edata[rownames(edata)==mutateddf$gene,]
  mapdata<-edata_keepDEG_edgeR_UP100
  mapdata_matrix <- data.matrix(mapdata)
  colfunc<-colorRampPalette(c("red","green")) #because we are blue group! 
  hr<-hclust(as.dist(1-cor(t(mapdata_matrix),method="pearson")),method="complete")
  plotheatmap<-heatmap.2(mapdata_matrix,Rowv=as.dendrogram(hr),scale="row",density.info="none",trace="none",col=colfunc(256),cexRow=0.4,cexCol=0.4)
dev.off()
## png 
##   2

Step18: Keep only DEG_edgeR genes from the edata

#genes which are sig differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_edgeR)
## [1] 3538    7
DEG_edgeRnames=rownames(result_allContrasts_FC5.00_DEG_edgeR)
head(DEG_edgeRnames)
## [1] "Chil3"    "Plbd1"    "Emb"      "AI839979" "Gm21188"  "Clec12a"
length(DEG_edgeRnames)
## [1] 3538
#save result_allContrasts_full_DEG_edgeR, result_allContrasts_FC5.00_DEG_edgeR, DEG_edgeRnames
save(result_allContrasts_full_DEG_edgeR, result_allContrasts_FC5.00_DEG_edgeR, DEG_edgeRnames, file="DEG_edgeR_data.RData")
#Keep only those genes which are differentially expressed in contrasts
dim(edata)
## [1] 22123    30
edata_keepDEG_edgeR<- edata[which(rownames(edata) %in% DEG_edgeRnames), ]
dim(edata_keepDEG_edgeR)
## [1] 3538   30
#unique and common genes between the different contrasts DEG_edgeR are retained
DT::datatable(edata_keepDEG_edgeR[1:7,1:7])
#Export edata keepDEG_edgeR genes only
write.csv(edata_keepDEG_edgeR, "Step18_result_edata_keepDEG_edgeR.csv")

Step18: Visualization of edata DEG_edgeR filter. Note that unfiltered was done under pca_unscaled

SVA none and keepDEG_edgeR

edata_keepDEG_edgeR filtered to keepDEG_edgeR genes

#Summary Statistics edata_keepDEG_edgeR
DT::datatable(summary(edata_keepDEG_edgeR[1:7,1:7]))
write.csv(summary(edata_keepDEG_edgeR),"Step18_result_summary_stats_edata_keepDEG_edgeR.csv")
pdf(file="Step18_plot_Visualization_boxplot_MDS_PC_keepDEG_edgeR_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))

                            ############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]

                            ############boxplot chunk###############
#Before boxplot edata_keepDEG_edgeR
par(mai=c(1,0.8,1,0.8))
boxplot(edata_keepDEG_edgeR, outline=FALSE, las=2, cex=0.25, main="edata_keepDEG_edgeR", col="yellow")

                            ############MDSplot chunk###############
#Before MDSplot edata_keepDEG_edgeR
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_keepDEG_edgeR, col=condition_colors, legend= "all", main="edata_keepDEG_edgeR", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
                    #######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_keepDEG_edgeR, colored by Study or batch, labels CellType
pca0=prcomp(t(edata_keepDEG_edgeR), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                        PC1       PC2      PC3
## astro_SRR1033783 -16.61836 -22.86241 2.043632
## astro_SRR1033784 -15.91182 -23.81046 2.533337
## endo_SRR1033795  -18.05997 -13.83223 1.989713
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_edgeR",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$CellType,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                             #########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_keepDEG_edgeR, colored by Study or batch, labels Species
pca0=prcomp(t(edata_keepDEG_edgeR), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                        PC1       PC2      PC3
## astro_SRR1033783 -16.61836 -22.86241 2.043632
## astro_SRR1033784 -15.91182 -23.81046 2.533337
## endo_SRR1033795  -18.05997 -13.83223 1.989713
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_edgeR",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Species,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                                 #########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_keepDEG_edgeR, colored by Study or batch, labels Age
pca0=prcomp(t(edata_keepDEG_edgeR), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                        PC1       PC2      PC3
## astro_SRR1033783 -16.61836 -22.86241 2.043632
## astro_SRR1033784 -15.91182 -23.81046 2.533337
## endo_SRR1033795  -18.05997 -13.83223 1.989713
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_edgeR",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Age,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

dev.off()
## png 
##   2

Step18: Preparing aggregate data expr or edata_keepDEG_edgeR from metadata by variables

t_edata_keepDEG_edgeR<-t(edata_keepDEG_edgeR)
DT::datatable(t_edata_keepDEG_edgeR[,1:7])
DT::datatable(pheno)
dim(t_edata_keepDEG_edgeR)
## [1]   30 3538
dim(pheno)
## [1] 30  5
#bind the pheno and edata_keepDEG_edgeR, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata_keepDEG_edgeR<-cbind(pheno, t_edata_keepDEG_edgeR)
dim(pheno_t_edata_keepDEG_edgeR)
## [1]   30 3543
DT::datatable(pheno_t_edata_keepDEG_edgeR[,1:10])
write.csv(pheno_t_edata_keepDEG_edgeR, "Step18_result_pheno_t_edata_keepDEG_edgeR_All_Group_aggregate.csv")

Aggregade by CellType

#select the CellType pheno drop others
pheno_t_edata_keepDEG_edgeR_CellType=pheno_t_edata_keepDEG_edgeR[!names(pheno_t_edata_keepDEG_edgeR) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_edgeR_CellType)
## [1]   30 3539
DT::datatable(pheno_t_edata_keepDEG_edgeR_CellType[,1:10])
#aggregate edata_keepDEG_edgeR or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_edgeR_CellType)
pheno_t_edata_keepDEG_edgeR_CellType_avg=pheno_t_edata_keepDEG_edgeR_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_keepDEG_edgeR_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_edgeR_CellType_avg=t(pheno_t_edata_keepDEG_edgeR_CellType_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_edgeR_CellType_avg)=pheno_t_edata_keepDEG_edgeR_CellType_avg$CellType
dim(t_pheno_t_edata_keepDEG_edgeR_CellType_avg)
## [1] 3538    2
DT::datatable(t_pheno_t_edata_keepDEG_edgeR_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_edgeR_CellType_avg, "Step18_result_edata_keepDEG_edgeR_CellType_aggregate.csv")

Aggregade by Species

#select the Species pheno drop others
pheno_t_edata_keepDEG_edgeR_Species=pheno_t_edata_keepDEG_edgeR[!names(pheno_t_edata_keepDEG_edgeR) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_keepDEG_edgeR_Species)
## [1]   30 3539
DT::datatable(pheno_t_edata_keepDEG_edgeR_Species[,1:10])
#aggregate edata_keepDEG_edgeR or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_edgeR_Species)
pheno_t_edata_keepDEG_edgeR_Species_avg=pheno_t_edata_keepDEG_edgeR_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_keepDEG_edgeR_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_edgeR_Species_avg=t(pheno_t_edata_keepDEG_edgeR_Species_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_edgeR_Species_avg)=pheno_t_edata_keepDEG_edgeR_Species_avg$Species
dim(t_pheno_t_edata_keepDEG_edgeR_Species_avg)
## [1] 3538    2
DT::datatable(t_pheno_t_edata_keepDEG_edgeR_Species_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_edgeR_Species_avg, "Step18_result_edata_keepDEG_edgeR_Species_aggregate.csv")

Aggregade by Age

#select the Age pheno drop others
pheno_t_edata_keepDEG_edgeR_Age=pheno_t_edata_keepDEG_edgeR[!names(pheno_t_edata_keepDEG_edgeR) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_edgeR_Age)
## [1]   30 3539
DT::datatable(pheno_t_edata_keepDEG_edgeR_Age[,1:10])
#aggregate edata_keepDEG_edgeR or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_edgeR_Age)
pheno_t_edata_keepDEG_edgeR_Age_avg=pheno_t_edata_keepDEG_edgeR_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_keepDEG_edgeR_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_edgeR_Age_avg=t(pheno_t_edata_keepDEG_edgeR_Age_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_edgeR_Age_avg)=pheno_t_edata_keepDEG_edgeR_Age_avg$Age
dim(t_pheno_t_edata_keepDEG_edgeR_Age_avg)
## [1] 3538    2
DT::datatable(t_pheno_t_edata_keepDEG_edgeR_Age_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_edgeR_Age_avg, "Step18_result_edata_keepDEG_edgeR_Age_aggregate.csv")

Aggregade by Tissue

#select the Tissue pheno drop others
pheno_t_edata_keepDEG_edgeR_Tissue=pheno_t_edata_keepDEG_edgeR[!names(pheno_t_edata_keepDEG_edgeR) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_keepDEG_edgeR_Tissue)
## [1]   30 3539
DT::datatable(pheno_t_edata_keepDEG_edgeR_Tissue[,1:10])
#aggregate edata_keepDEG_edgeR or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_edgeR_Tissue)
pheno_t_edata_keepDEG_edgeR_Tissue_avg=pheno_t_edata_keepDEG_edgeR_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_keepDEG_edgeR_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_edgeR_Tissue_avg=t(pheno_t_edata_keepDEG_edgeR_Tissue_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg)=pheno_t_edata_keepDEG_edgeR_Tissue_avg$Tissue
dim(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg)
## [1] 3538    4
DT::datatable(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg, "Step18_result_edata_keepDEG_edgeR_Tissue_aggregate.csv")

Aggregade all Groups

t_pheno_t_edata_keepDEG_edgeR_all_avg<-cbind(t_pheno_t_edata_keepDEG_edgeR_CellType_avg, t_pheno_t_edata_keepDEG_edgeR_Species_avg, t_pheno_t_edata_keepDEG_edgeR_Age_avg, t_pheno_t_edata_keepDEG_edgeR_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_keepDEG_edgeR_all_avg))
write.csv(t_pheno_t_edata_keepDEG_edgeR_all_avg, "Step18_result_edata_keepDEG_edgeR_All_Group_aggregate.csv")
#save pheno_t_edata_keepDEG_edgeR, t_pheno_t_edata_keepDEG_edgeR_CellType_avg, t_pheno_t_edata_keepDEG_edgeR_Species_avg, t_pheno_t_edata_keepDEG_edgeR_Age_avg, t_pheno_t_edata_keepDEG_edgeR_Tissue_avg, t_pheno_t_edata_keepDEG_edgeR_all_avg
save(pheno_t_edata_keepDEG_edgeR, t_pheno_t_edata_keepDEG_edgeR_CellType_avg, t_pheno_t_edata_keepDEG_edgeR_Species_avg, t_pheno_t_edata_keepDEG_edgeR_Age_avg, t_pheno_t_edata_keepDEG_edgeR_Tissue_avg, t_pheno_t_edata_keepDEG_edgeR_all_avg, file="edata_keepDEG_edgeR_aggregate_data.RData")

Step18: Visualization by heatmap and correlation of aggregate data expr or edata_keepDEG_edgeR from metadata by variables

pdf(file="Step18_plot_boxplot_corr_plot_edata_keepDEG_edgeR_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))

                    #########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_keepDEG_edgeR_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_CellType_avg", col="yellow")

#box plot on aggregate Species
boxplot(t_pheno_t_edata_keepDEG_edgeR_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_Species_avg", col="yellow")

#box plot on aggregate Age
boxplot(t_pheno_t_edata_keepDEG_edgeR_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_Age_avg", col="yellow")

#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_Tissue_avg", col="yellow")

#box plot on aggregate all
boxplot(t_pheno_t_edata_keepDEG_edgeR_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_edgeR_all_avg", col="yellow")

                    #########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_CellType_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_Species_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_Age_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_Tissue_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_edgeR_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step18_result_edata_keepDEG_edgeR_all_Group_corr_pval_aggregate.csv")

dev.off()
## png 
##   2
pdf(file="Step18_plot_density_edata_keepDEG_edgeR_aka_expr_aggregate.pdf",height=10,width=10)

                    #########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_keepDEG_edgeR_CellType_avg", y = "Density")


#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Species",x="t_pheno_t_edata_keepDEG_edgeR_Species_avg", y = "Density")
  

#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Age",x="t_pheno_t_edata_keepDEG_edgeR_Age_avg", y = "Density")

#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_keepDEG_edgeR_Tissue_avg", y = "Density")

#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_keepDEG_edgeR_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_edgeR_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for all",x="t_pheno_t_edata_keepDEG_edgeR_all_avg", y = "Density") 

dev.off()
## png 
##   2

Clear up workspace and retain required dataframes

#save pheno, edata 
save(pheno, metadata_1, edata, edata_keepDEG_edgeR, file="edata_keepDEG_edgeR_data.RData")
#save image
save.image(file="DEG_edgeR_temp.RData")
rm(list=ls())
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 4048911 216.3    8073370 431.2  8073370 431.2
## Vcells 7892956  60.3   28223680 215.4 35279601 269.2

Step19: Differentially expressed genes DEG_Cuff division of arithmetic means of expression data so easier to interpret like cuffdiff (published CGGA code)

Step19: Visualization of differential gene expression results from DEG_Cuff

Step20: Keep only DEG_Cuff genes from the edata

Step20: Visualization of edata DEG_Cuff filter. Note that unfiltered was done under pca_unscaled

Step20: Preparing aggregate data expr or edata_keepDEG_Cuff from metadata by variables

Step20: Visualization by heatmap and correlation of aggregate data expr or edata_keepDEG_Cuff from metadata by variables

Step21: Comparison of overlap between DEG_Limma DEG_Simple DEG_edgeR and (if available) DEG_cuff

#Load pheno, edata required for steps below
load(file="done_pre_Step8.RData")
#Load DEG_edgeR DEG_Limma and DEG_Simple results required for steps below
load(file="DEG_edgeR_data.RData")
load(file="DEG_Limma_data.RData")
load(file="DEG_Simple_data.RData")

gc() 
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4248895 227.0    8073370 431.2  8073370 431.2
## Vcells 17902113 136.6   28223680 215.4 35279601 269.2
#load required libraries
#https://www.bioconductor.org/packages/devel/bioc/vignettes/GeneOverlap/inst/doc/GeneOverlap.pdf
library(GeneOverlap)
library(VennDiagram)
## Warning: package 'VennDiagram' was built under R version 3.5.3
## Loading required package: grid
## Loading required package: futile.logger
## Warning: package 'futile.logger' was built under R version 3.5.3
## 
## Attaching package: 'futile.logger'
## The following object is masked from 'package:mgcv':
## 
##     scat
#Create a list object of all the DEG genes
DEG_list<-list(DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames)
names(DEG_list)<-c("DEG_edgeRnames","DEG_Limmanames", "DEG_Simplenames")
summary(DEG_list)
##                 Length Class  Mode     
## DEG_edgeRnames  3538   -none- character
## DEG_Limmanames   856   -none- character
## DEG_Simplenames  977   -none- character
#calculate total number of unique genes in the DEG 
size_estimate1=c(DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames)
size_estimate2=unique(size_estimate1)
size_estimate3=length(size_estimate2)
#Calculate gene overlap matrix between the DEG genes from DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames
gom.self <- newGOM(DEG_list, DEG_list, genome.size=size_estimate3)
#export p-value for overlap between DEG genes from DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames
geom.mat<-getMatrix(gom.self, name="pval")
geom.mat
##                 DEG_edgeRnames DEG_Limmanames DEG_Simplenames
## DEG_edgeRnames    0.000000e+00   1.896785e-97               1
## DEG_Limmanames    1.896785e-97   0.000000e+00               1
## DEG_Simplenames   1.000000e+00   1.000000e+00               0
write.csv(geom.mat, "Step21_result_p-value_overlap_DEG_all.csv") 
#Here the genes are counted for each list of genes and overlap is shown
gom.self["DEG_edgeRnames", "DEG_Limmanames"]
## GeneOverlap object:
## listA size=3538
## listB size=856
## Intersection size=854
## Overlapping p-value=1.9e-97
## Jaccard Index=0.2
gom.self["DEG_edgeRnames", "DEG_Simplenames"]
## GeneOverlap object:
## listA size=3538
## listB size=977
## Intersection size=9
## Overlapping p-value=1
## Jaccard Index=0.0
gom.self["DEG_Limmanames", "DEG_Simplenames"]
## GeneOverlap object:
## listA size=856
## listB size=977
## Intersection size=0
## Overlapping p-value=1
## Jaccard Index=0.0

Step21: Visualization of overlap between DEG_Limma DEG_Simple and DEG_edgeR

pdf(file="Step21_plot_Visualization_heatmap_overlap_DEG_all.pdf",height=10,width=10)
#par(mfrow=c(2,1))

drawHeatmap(gom.self, what = "Jaccard")
#"odds.ratio"   "Jaccard"

dev.off()
## png 
##   2
pdf(file="Step21_plot_Visualization_vennDiagram_overlap_DEG_all.pdf",height=10,width=10)
#par(mfrow=c(2,1))

grid.draw(venn.diagram(list(DEG_edgeRnames = DEG_edgeRnames, DEG_Limmanames = DEG_Limmanames, 
    DEG_Simplenames = DEG_Simplenames), filename = NULL, fill = c("blue", "orange", 
    "green")))

dev.off()
## png 
##   2

Step22: Keep only DEG_edgeRLimmaSimple genes from the edata

Keep only DEG_edgeRLimmaSimple genes from the DEG_Limma, DEG_edgeR and DEG_Simple results

#save names of genes overlaped for DEG_edgeRnames,DEG_Limmanames, DEG_Simplenames in atleast 2
DEG_AllDEGmethodsnames_3grs=intersect(intersect(DEG_edgeRnames, DEG_Limmanames), DEG_Simplenames)


DEG_AllDEGmethodsnames_2grs_1=intersect(DEG_edgeRnames, DEG_Limmanames)
DEG_AllDEGmethodsnames_2grs_2=intersect(DEG_edgeRnames, DEG_Simplenames)
DEG_AllDEGmethodsnames_2grs_3=intersect(DEG_Limmanames, DEG_Simplenames)


DEG_AllDEGmethodsnames=cbind(DEG_AllDEGmethodsnames_3grs,DEG_AllDEGmethodsnames_2grs_1,DEG_AllDEGmethodsnames_2grs_2,DEG_AllDEGmethodsnames_2grs_3)
## Warning in cbind(DEG_AllDEGmethodsnames_3grs,
## DEG_AllDEGmethodsnames_2grs_1, : number of rows of result is not a multiple
## of vector length (arg 3)
head(DEG_AllDEGmethodsnames)
##      DEG_AllDEGmethodsnames_2grs_1 DEG_AllDEGmethodsnames_2grs_2
## [1,] "Chil3"                       "Gm18026"                    
## [2,] "Plbd1"                       "Gm47851"                    
## [3,] "Emb"                         "Gm13411"                    
## [4,] "AI839979"                    "Gm22472"                    
## [5,] "Gm21188"                     "Gm42666"                    
## [6,] "Clec12a"                     "Gm25189"
length(DEG_AllDEGmethodsnames)
## [1] 1708
#Keep only those genes which are differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_Limma)
## [1] 856   8
result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods<-result_allContrasts_FC5.00_DEG_Limma[which(rownames(result_allContrasts_FC5.00_DEG_Limma) %in% DEG_AllDEGmethodsnames), ]
dim(result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods)
## [1] 854   8
#unique and common genes between the different contrasts DEG_edgeRLimmaSimple are retained
DT::datatable(result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods[1:7,])
#Export edata keepDEG_edgeRLimmaSimple genes only
write.csv(result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods, "Step22_result_allContrasts_FC5.00_DEG_Limma_AllDEGmethods.csv")

#Keep only those genes which are differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_edgeR)
## [1] 3538    7
result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods<-result_allContrasts_FC5.00_DEG_edgeR[which(rownames(result_allContrasts_FC5.00_DEG_edgeR) %in% DEG_AllDEGmethodsnames), ]
dim(result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods)
## [1] 863   7
#unique and common genes between the different contrasts DEG_edgeRedgeRSimple are retained
DT::datatable(result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods[1:7,])
#Export edata keepDEG_edgeRedgeRSimple genes only
write.csv(result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods, "Step22_result_allContrasts_FC5.00_DEG_edgeR_AllDEGmethods.csv")

#Keep only those genes which are differentially expressed in contrasts
dim(result_allContrasts_FC5.00_DEG_Simple)
## [1] 977   8
result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods<-result_allContrasts_FC5.00_DEG_Simple[which(rownames(result_allContrasts_FC5.00_DEG_Simple) %in% DEG_AllDEGmethodsnames), ]
dim(result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods)
## [1] 9 8
#unique and common genes between the different contrasts DEG_SimpleSimpleSimple are retained
DT::datatable(result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods[1:7,])
#Export edata keepDEG_SimpleSimpleSimple genes only
write.csv(result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods, "Step22_result_allContrasts_FC5.00_DEG_Simple_AllDEGmethods.csv")
#save DEG_AllDEGmethodsnames
save(DEG_AllDEGmethodsnames, file="DEG_AllDEGmethodsnames.RData")
#Keep only those genes which are differentially expressed in contrasts
dim(edata)
## [1] 22123    30
edata_keepDEG_AllDEGmethods<- edata[which(rownames(edata) %in% DEG_AllDEGmethodsnames), ]
dim(edata_keepDEG_AllDEGmethods)
## [1] 863  30
#unique and common genes between the different contrasts DEG_AllDEGmethods are retained
DT::datatable(edata_keepDEG_AllDEGmethods[1:7,1:7])
#Export edata keepDEG_AllDEGmethods genes only
write.csv(edata_keepDEG_AllDEGmethods, "Step22_result_edata_keepDEG_AllDEGmethods.csv")

Step22: Visualization of edata DEG_AllDEGmethods filter. Note that unfiltered was done under pca_unscaled

SVA none and keepDEG_AllDEGmethods

edata_keepDEG_AllDEGmethods filtered to keepDEG_AllDEGmethods genes

#Summary Statistics edata_keepDEG_AllDEGmethods
DT::datatable(summary(edata_keepDEG_AllDEGmethods[1:7,1:7]))
write.csv(summary(edata_keepDEG_AllDEGmethods),"Step22_result_summary_stats_edata_keepDEG_AllDEGmethods.csv")
pdf(file="Step22_plot_Visualization_boxplot_MDS_PC_keepDEG_AllDEGmethods_for_edata.pdf",height=10,width=10)
par(mfrow=c(2,1))

                            ############set colors for chunk Study###############
nconditions <- nlevels(as.factor(pheno$Study))
npal <- colorRampPalette(brewer.pal(nconditions, "Set1"))(nconditions)
condition_colors <- npal[as.integer(pheno$Study)]

                            ############boxplot chunk###############
#Before boxplot edata_keepDEG_AllDEGmethods
par(mai=c(1,0.8,1,0.8))
boxplot(edata_keepDEG_AllDEGmethods, outline=FALSE, las=2, cex=0.25, main="edata_keepDEG_AllDEGmethods", col="yellow")

                            ############MDSplot chunk###############
#Before MDSplot edata_keepDEG_AllDEGmethods
par(mai=c(1,0.8,1,0.8))
plotMDS(edata_keepDEG_AllDEGmethods, col=condition_colors, legend= "all", main="edata_keepDEG_AllDEGmethods", cex=0.5)#like PCA plot
## Warning in plot.window(...): "legend" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "legend" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "legend" is
## not a graphical parameter
## Warning in box(...): "legend" is not a graphical parameter
## Warning in title(...): "legend" is not a graphical parameter
## Warning in text.default(x$x, x$y, labels = labels, cex = cex, ...):
## "legend" is not a graphical parameter
                    #######PCAplot chunk CellType colored by Study or batch########
#Before PCAplot edata_keepDEG_AllDEGmethods, colored by Study or batch, labels CellType
pca0=prcomp(t(edata_keepDEG_AllDEGmethods), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2       PC3
## astro_SRR1033783 -10.329311 -9.764326 7.5761186
## astro_SRR1033784  -9.528273 -9.819515 8.5336195
## endo_SRR1033795  -10.071173 -5.832391 0.9841376
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_AllDEGmethods",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$CellType,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                             #########PCAplot chunk Species colored by Study or batch######
#Before PCAplot edata_keepDEG_AllDEGmethods, colored by Study or batch, labels Species
pca0=prcomp(t(edata_keepDEG_AllDEGmethods), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2       PC3
## astro_SRR1033783 -10.329311 -9.764326 7.5761186
## astro_SRR1033784  -9.528273 -9.819515 8.5336195
## endo_SRR1033795  -10.071173 -5.832391 0.9841376
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_AllDEGmethods",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Species,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

                                 #########PCAplot chunk Age colored by Study or batch########
#Before PCAplot edata_keepDEG_AllDEGmethods, colored by Study or batch, labels Age
pca0=prcomp(t(edata_keepDEG_AllDEGmethods), center=T, scale=T)
names(pca0)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#summary(pca0)
pca0$x[1:3,1:3]
##                         PC1       PC2       PC3
## astro_SRR1033783 -10.329311 -9.764326 7.5761186
## astro_SRR1033784  -9.528273 -9.819515 8.5336195
## endo_SRR1033795  -10.071173 -5.832391 0.9841376
plot(x=pca0$x[,1], y=pca0$x[,2],
     cex=0,
     xlab="PC1", ylab="PC2",
     main="PC plot colored by Study edata_keepDEG_AllDEGmethods",
     font=2
)
text(x=pca0$x[,1], y=pca0$x[,2],
     labels=metadata_1$Age,
     col=as.numeric(metadata_1$Study),
     cex=0.7, font=2)
legend("bottomright",
       legend=c("GSE52564_to_SRP035309"),
       text.col=c("red", "blue", "green")
)

dev.off()
## png 
##   2

Step22: Preparing aggregate data expr or edata_keepDEG_AllDEGmethods from metadata by variables

t_edata_keepDEG_AllDEGmethods<-t(edata_keepDEG_AllDEGmethods)
DT::datatable(t_edata_keepDEG_AllDEGmethods[,1:7])
DT::datatable(pheno)
dim(t_edata_keepDEG_AllDEGmethods)
## [1]  30 863
dim(pheno)
## [1] 30  5
#bind the pheno and edata_keepDEG_AllDEGmethods, this file can be used to predict trait based on a particular gene's gene expression
pheno_t_edata_keepDEG_AllDEGmethods<-cbind(pheno, t_edata_keepDEG_AllDEGmethods)
dim(pheno_t_edata_keepDEG_AllDEGmethods)
## [1]  30 868
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods[,1:10])
write.csv(pheno_t_edata_keepDEG_AllDEGmethods, "Step22_result_pheno_t_edata_keepDEG_AllDEGmethods_All_Group_aggregate.csv")

Aggregade by CellType

#select the CellType pheno drop others
pheno_t_edata_keepDEG_AllDEGmethods_CellType=pheno_t_edata_keepDEG_AllDEGmethods[!names(pheno_t_edata_keepDEG_AllDEGmethods) %in% c("Age", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_AllDEGmethods_CellType)
## [1]  30 864
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_CellType[,1:10])
#aggregate edata_keepDEG_AllDEGmethods or expression for the CellType pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_AllDEGmethods_CellType)
pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg=pheno_t_edata_keepDEG_AllDEGmethods_CellType[, lapply(.SD, mean), by = .(CellType)]
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg=t(pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg)=pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg$CellType
dim(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg)
## [1] 863   2
DT::datatable(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, "Step22_result_edata_keepDEG_AllDEGmethods_CellType_aggregate.csv")

Aggregade by Species

#select the Species pheno drop others
pheno_t_edata_keepDEG_AllDEGmethods_Species=pheno_t_edata_keepDEG_AllDEGmethods[!names(pheno_t_edata_keepDEG_AllDEGmethods) %in% c("Age", "Study", "CellType", "Tissue")]
dim(pheno_t_edata_keepDEG_AllDEGmethods_Species)
## [1]  30 864
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Species[,1:10])
#aggregate edata_keepDEG_AllDEGmethods or expression for the Species pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_AllDEGmethods_Species)
pheno_t_edata_keepDEG_AllDEGmethods_Species_avg=pheno_t_edata_keepDEG_AllDEGmethods_Species[, lapply(.SD, mean), by = .(Species)]
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Species_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg=t(pheno_t_edata_keepDEG_AllDEGmethods_Species_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg)=pheno_t_edata_keepDEG_AllDEGmethods_Species_avg$Species
dim(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg)
## [1] 863   2
DT::datatable(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, "Step22_result_edata_keepDEG_AllDEGmethods_Species_aggregate.csv")

Aggregade by Age

#select the Age pheno drop others
pheno_t_edata_keepDEG_AllDEGmethods_Age=pheno_t_edata_keepDEG_AllDEGmethods[!names(pheno_t_edata_keepDEG_AllDEGmethods) %in% c("CellType", "Study", "Species", "Tissue")]
dim(pheno_t_edata_keepDEG_AllDEGmethods_Age)
## [1]  30 864
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Age[,1:10])
#aggregate edata_keepDEG_AllDEGmethods or expression for the Age pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_AllDEGmethods_Age)
pheno_t_edata_keepDEG_AllDEGmethods_Age_avg=pheno_t_edata_keepDEG_AllDEGmethods_Age[, lapply(.SD, mean), by = .(Age)]
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Age_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg=t(pheno_t_edata_keepDEG_AllDEGmethods_Age_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg)=pheno_t_edata_keepDEG_AllDEGmethods_Age_avg$Age
dim(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg)
## [1] 863   2
DT::datatable(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, "Step22_result_edata_keepDEG_AllDEGmethods_Age_aggregate.csv")

Aggregade by Tissue

#select the Tissue pheno drop others
pheno_t_edata_keepDEG_AllDEGmethods_Tissue=pheno_t_edata_keepDEG_AllDEGmethods[!names(pheno_t_edata_keepDEG_AllDEGmethods) %in% c("CellType", "Study", "Species", "Age")]
dim(pheno_t_edata_keepDEG_AllDEGmethods_Tissue)
## [1]  30 864
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Tissue[,1:10])
#aggregate edata_keepDEG_AllDEGmethods or expression for the Tissue pheno
#https://campus.datacamp.com/courses/data-table-data-manipulation-r-tutorial/chapter-two-datatable-yeoman?ex=6
library(data.table)
setDT(pheno_t_edata_keepDEG_AllDEGmethods_Tissue)
pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg=pheno_t_edata_keepDEG_AllDEGmethods_Tissue[, lapply(.SD, mean), by = .(Tissue)]
DT::datatable(pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg[,1:10])
#transpose genes to rows and variable to column
t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg=t(pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg[,-1])
colnames(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg)=pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg$Tissue
dim(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg)
## [1] 863   4
DT::datatable(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg[1:10,])
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg, "Step22_result_edata_keepDEG_AllDEGmethods_Tissue_aggregate.csv")

Aggregade all Groups

t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg<-cbind(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg)
DT::datatable(head(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg))
write.csv(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg, "Step22_result_edata_keepDEG_AllDEGmethods_All_Group_aggregate.csv")
#save pheno_t_edata_keepDEG_AllDEGmethods, t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg
save(pheno_t_edata_keepDEG_AllDEGmethods, t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg, t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg, file="edata_keepDEG_AllDEGmethods_aggregate_data.RData")

Step22: Visualization by heatmap and correlation of aggregate data expr or edata_keepDEG_AllDEGmethods from metadata by variables

pdf(file="Step22_plot_boxplot_corr_plot_edata_keepDEG_AllDEGmethods_aka_expr_aggregate.pdf",height=10,width=10)
par(mfrow=c(2,1))

                    #########boxplot CellType Species Age Tissue and all ########
#box plot on aggregate CellType
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg", col="yellow")

#box plot on aggregate Species
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg", col="yellow")

#box plot on aggregate Age
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg", col="yellow")

#box plot on aggregate Tissue
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg", col="yellow")

#box plot on aggregate all
boxplot(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg, outline=FALSE, las=2, cex=0.25, main="t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg", col="yellow")

                    #########corr plot CellType Species Age Tissue and all ########
#create cor matrix on aggregate CellType
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_CellType_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Species
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_Species_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Age
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_Age_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate Tissue
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_Tissue_Group_corr_pval_aggregate.csv")

#create cor matrix on aggregate all
rel2 <- rcorr(as.matrix(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg))
# Extract the correlation coefficients
rel2_r<-rel2$r
# Extract p-values
rel2_P<-rel2$P
#plot corelations and p-value
plot_corrdata<-corrplot(rel2$r, type="full", order="hclust", p.mat = rel2$P, sig.level = 0.01, insig = "blank")
#export corr data
write.csv(cbind(rel2_r, rel2_P),"Step22_result_edata_keepDEG_AllDEGmethods_all_Group_corr_pval_aggregate.csv")

dev.off()
## png 
##   2
pdf(file="Step22_plot_density_edata_keepDEG_AllDEGmethods_aka_expr_aggregate.pdf",height=10,width=10)

                    #########density plot CellType Species Age Tissue and all ########
#density plot on aggregate CellType
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for CellType",x="t_pheno_t_edata_keepDEG_AllDEGmethods_CellType_avg", y = "Density")


#density plot on aggregate Species
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Species",x="t_pheno_t_edata_keepDEG_AllDEGmethods_Species_avg", y = "Density")
  

#density plot on aggregate Age
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Age",x="t_pheno_t_edata_keepDEG_AllDEGmethods_Age_avg", y = "Density")

#density plot on aggregate Tissue
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for Tissue",x="t_pheno_t_edata_keepDEG_AllDEGmethods_Tissue_avg", y = "Density")

#density plot on aggregate all
df=as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg)
df.m <- melt(as.data.frame(t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg))
## No id variables; using all as measure variables
library(ggplot2)
ggplot(df.m, aes(fill=variable)) + 
  geom_density(aes(x = value, colour = variable)) + labs(x = NULL)+
  labs(title="gene expression density curve for all",x="t_pheno_t_edata_keepDEG_AllDEGmethods_all_avg", y = "Density") 

dev.off()
## png 
##   2

Clear up workspace and retain required dataframes

#save pheno, edata required for WGCNA
save(pheno, metadata_1, edata, edata_keepDEG_AllDEGmethods, file="edata_keepDEG_AllDEGmethods_data.RData")
#save image
save.image(file="DEG_AllDEGmethods_temp.RData")
rm(list=ls())
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 4115781 219.9    8073370 431.2  8073370 431.2
## Vcells 7978274  60.9   28223680 215.4 35279601 269.2

End of DEG pipeline, uses edata and modified metadata

Step23:Organization and saving session (software version) information

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 16299)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] VennDiagram_1.6.20  futile.logger_1.4.3 GeneOverlap_1.16.0 
##  [4] edgeR_3.22.5        dplyr_0.8.0.1       ggrepel_0.8.1      
##  [7] data.table_1.12.2   expss_0.8.11        filesstrings_3.0.0 
## [10] stringr_1.4.0       corrplot_0.84       viridis_0.5.1      
## [13] viridisLite_0.3.0   Hmisc_4.2-0         Formula_1.2-3      
## [16] lattice_0.20-38     RColorBrewer_1.1-2  gplots_3.0.1.1     
## [19] ggmap_3.0.0         ggplot2_3.1.1       pamr_1.56.1        
## [22] survival_2.44-1.1   cluster_2.0.8       sva_3.28.0         
## [25] BiocParallel_1.14.2 genefilter_1.62.0   mgcv_1.8-28        
## [28] nlme_3.1-137        limma_3.36.5       
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1     rjson_0.2.20         htmlTable_1.13.1    
##  [4] base64enc_0.1-3      rstudioapi_0.10      DT_0.5              
##  [7] bit64_0.9-7          AnnotationDbi_1.42.1 splines_3.5.2       
## [10] knitr_1.22           jsonlite_1.6         annotate_1.58.0     
## [13] png_0.1-7            shiny_1.3.2          compiler_3.5.2      
## [16] httr_1.4.0           backports_1.1.4      assertthat_0.2.1    
## [19] Matrix_1.2-17        lazyeval_0.2.2       later_0.8.0         
## [22] formatR_1.6          acepack_1.4.1        htmltools_0.3.6     
## [25] tools_3.5.2          gtable_0.3.0         glue_1.3.1          
## [28] reshape2_1.4.3       Rcpp_1.0.1           Biobase_2.40.0      
## [31] gdata_2.18.0         crosstalk_1.0.0      xfun_0.6            
## [34] mime_0.6             gtools_3.8.1         XML_3.98-1.19       
## [37] scales_1.0.0         promises_1.0.1       parallel_3.5.2      
## [40] lambda.r_1.2.3       yaml_2.2.0           strex_0.1.3         
## [43] memoise_1.1.0        gridExtra_2.3        rpart_4.1-15        
## [46] latticeExtra_0.6-28  stringi_1.4.3        RSQLite_2.1.1       
## [49] S4Vectors_0.18.3     checkmate_1.9.1      caTools_1.17.1.2    
## [52] BiocGenerics_0.26.0  RgoogleMaps_1.4.3    rlang_0.3.4         
## [55] pkgconfig_2.0.2      matrixStats_0.54.0   bitops_1.0-6        
## [58] evaluate_0.13        purrr_0.3.2          htmlwidgets_1.3     
## [61] labeling_0.3         bit_1.1-14           tidyselect_0.2.5    
## [64] plyr_1.8.4           magrittr_1.5         R6_2.4.0            
## [67] IRanges_2.14.12      DBI_1.0.0            pillar_1.3.1        
## [70] foreign_0.8-71       withr_2.1.2          RCurl_1.95-4.12     
## [73] nnet_7.3-12          tibble_2.1.1         crayon_1.3.4        
## [76] futile.options_1.0.1 KernSmooth_2.23-15   rmarkdown_1.12      
## [79] jpeg_0.1-8           locfit_1.5-9.1       blob_1.1.1          
## [82] digest_0.6.18        xtable_1.8-4         tidyr_0.8.3         
## [85] httpuv_1.5.1         stats4_3.5.2         munsell_0.5.0
toLatex(sessionInfo())
## \begin{itemize}\raggedright
##   \item R version 3.5.2 (2018-12-20), \verb|x86_64-w64-mingw32|
##   \item Locale: \verb|LC_COLLATE=English_United States.1252|, \verb|LC_CTYPE=English_United States.1252|, \verb|LC_MONETARY=English_United States.1252|, \verb|LC_NUMERIC=C|, \verb|LC_TIME=English_United States.1252|
##   \item Running under: \verb|Windows 10 x64 (build 16299)|
##   \item Matrix products: default
##   \item Base packages: base, datasets, graphics, grDevices, grid,
##     methods, stats, utils
##   \item Other packages: BiocParallel~1.14.2, cluster~2.0.8,
##     corrplot~0.84, data.table~1.12.2, dplyr~0.8.0.1, edgeR~3.22.5,
##     expss~0.8.11, filesstrings~3.0.0, Formula~1.2-3,
##     futile.logger~1.4.3, genefilter~1.62.0, GeneOverlap~1.16.0,
##     ggmap~3.0.0, ggplot2~3.1.1, ggrepel~0.8.1, gplots~3.0.1.1,
##     Hmisc~4.2-0, lattice~0.20-38, limma~3.36.5, mgcv~1.8-28,
##     nlme~3.1-137, pamr~1.56.1, RColorBrewer~1.1-2, stringr~1.4.0,
##     survival~2.44-1.1, sva~3.28.0, VennDiagram~1.6.20,
##     viridis~0.5.1, viridisLite~0.3.0
##   \item Loaded via a namespace (and not attached): acepack~1.4.1,
##     annotate~1.58.0, AnnotationDbi~1.42.1, assertthat~0.2.1,
##     backports~1.1.4, base64enc~0.1-3, Biobase~2.40.0,
##     BiocGenerics~0.26.0, bit~1.1-14, bit64~0.9-7, bitops~1.0-6,
##     blob~1.1.1, caTools~1.17.1.2, checkmate~1.9.1,
##     colorspace~1.4-1, compiler~3.5.2, crayon~1.3.4,
##     crosstalk~1.0.0, DBI~1.0.0, digest~0.6.18, DT~0.5,
##     evaluate~0.13, foreign~0.8-71, formatR~1.6,
##     futile.options~1.0.1, gdata~2.18.0, glue~1.3.1, gridExtra~2.3,
##     gtable~0.3.0, gtools~3.8.1, htmlTable~1.13.1, htmltools~0.3.6,
##     htmlwidgets~1.3, httpuv~1.5.1, httr~1.4.0, IRanges~2.14.12,
##     jpeg~0.1-8, jsonlite~1.6, KernSmooth~2.23-15, knitr~1.22,
##     labeling~0.3, lambda.r~1.2.3, later~0.8.0,
##     latticeExtra~0.6-28, lazyeval~0.2.2, locfit~1.5-9.1,
##     magrittr~1.5, Matrix~1.2-17, matrixStats~0.54.0,
##     memoise~1.1.0, mime~0.6, munsell~0.5.0, nnet~7.3-12,
##     parallel~3.5.2, pillar~1.3.1, pkgconfig~2.0.2, plyr~1.8.4,
##     png~0.1-7, promises~1.0.1, purrr~0.3.2, R6~2.4.0, Rcpp~1.0.1,
##     RCurl~1.95-4.12, reshape2~1.4.3, RgoogleMaps~1.4.3,
##     rjson~0.2.20, rlang~0.3.4, rmarkdown~1.12, rpart~4.1-15,
##     RSQLite~2.1.1, rstudioapi~0.10, S4Vectors~0.18.3,
##     scales~1.0.0, shiny~1.3.2, splines~3.5.2, stats4~3.5.2,
##     strex~0.1.3, stringi~1.4.3, tibble~2.1.1, tidyr~0.8.3,
##     tidyselect~0.2.5, tools~3.5.2, withr~2.1.2, xfun~0.6,
##     XML~3.98-1.19, xtable~1.8-4, yaml~2.2.0
## \end{itemize}
#Organize of files
#library(filesstrings)

#raw, expr normalized and metadata 
dir.create("merged_expr_metadata")
file.move(list.files(pattern = '*rawdata_transformation_FewSamples.pdf'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*rawdata_transformation.pdf'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*SampleID.csv'), "merged_expr_metadata")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*SampleID_coded.csv'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'preSVA_Merging.RData'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'done_pre_Step8.RData'), "merged_expr_metadata")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'GPL*'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*data_non-log'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*data_originalfromGEO'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*Annotation_Expr_GeneMs.csv'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*metadata.csv'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*SampleID_coded.csv'), "merged_expr_metadata")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'preSVA*'), "merged_expr_metadata")
## 0 files moved. 0 failed.
#Aggregate expression by variable  and plots
dir.create("Aggregate_csv_plots")
file.move(list.files(pattern = '*aggregate.csv'), "Aggregate_csv_plots")
## 55 files moved. 0 failed.
file.move(list.files(pattern = '*aggregate.pdf'), "Aggregate_csv_plots")
## 10 files moved. 0 failed.
file.move(list.files(pattern = 'edata_aggregate_data.RData'), "Aggregate_csv_plots")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'edata_lmY_aggregate_data.RData'), "Aggregate_csv_plots")
## 0 files moved. 0 failed.
#edata_keepDEG results and plots
dir.create("edata_keepDEG")
file.move(list.files(pattern = '*edata.csv'), "edata_keepDEG")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'Step10_plot*'), "edata_keepDEG")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*edata_keepDEG_Limma.csv'), "edata_keepDEG")
## 2 files moved. 0 failed.
file.move(list.files(pattern = 'Step14_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*edata_keepDEG_Simple.csv'), "edata_keepDEG")
## 2 files moved. 0 failed.
file.move(list.files(pattern = 'Step16_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*edata_keepDEG_edgeR.csv'), "edata_keepDEG")
## 2 files moved. 0 failed.
file.move(list.files(pattern = 'Step18_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*edata_keepDEG_AllDEGmethods.csv'), "edata_keepDEG")
## 2 files moved. 0 failed.
file.move(list.files(pattern = 'Step22_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*edata_pca_unscaled.csv'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'Step7_plot*'), "edata_keepDEG")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'edata_keepDEG_AllDEGmethods_data'), "edata_keepDEG")
## 1 file moved. 0 failed.
#DEG_Limma results
dir.create("DEG_Limma_results")
file.move(list.files(pattern = '*FC5.00_DEG_Limma.csv'), "DEG_Limma_results")
## 3 files moved. 0 failed.
file.move(list.files(pattern = '*full_DEG_Limma.csv'), "DEG_Limma_results")
## 3 files moved. 0 failed.
file.move(list.files(pattern = '*_DEG_Limma.pdf'), "DEG_Limma_results")
## 5 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_Limma_data.RData'), "DEG_Limma_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*Limma_aggregate_data.RData'), "DEG_Limma_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'DEG_Limma_temp.RData'), "DEG_Limma_results")
## 1 file moved. 0 failed.
#DEG_Simple results
dir.create("DEG_Simple_results")
file.move(list.files(pattern = '*FC5.00_DEG_Simple.csv'), "DEG_Simple_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*full_DEG_Simple.csv'), "DEG_Simple_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*_DEG_Simple.pdf'), "DEG_Simple_results")
## 4 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_Simple_data.RData'), "DEG_Simple_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*Simple_aggregate_data.RData'), "DEG_Simple_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'DEG_Simple_temp.RData'), "DEG_Simple_results")
## 1 file moved. 0 failed.
#DEG_edgeR results
dir.create("DEG_edgeR_results")
file.move(list.files(pattern = '*FC5.00_DEG_edgeR.csv'), "DEG_edgeR_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*full_DEG_edgeR.csv'), "DEG_edgeR_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*_DEG_edgeR.pdf'), "DEG_edgeR_results")
## 4 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_edgeR_data.RData'), "DEG_edgeR_results")
## 2 files moved. 0 failed.
file.move(list.files(pattern = '*edgeR_aggregate_data.RData'), "DEG_edgeR_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'DEG_edgeR_temp.RData'), "DEG_edgeR_results")
## 1 file moved. 0 failed.
#DEG_Cuff results
dir.create("DEG_Cuff_results")
file.move(list.files(pattern = '*FC5.00_DEG_Cuff.csv'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*full_DEG_Cuff.csv'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*_DEG_Cuff.pdf'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_Cuff_data.RData'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = '*Cuff_aggregate_data.RData'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_Cuff_temp.RData'), "DEG_Cuff_results")
## 0 files moved. 0 failed.
#DEG_AllDEGmethods results
dir.create("DEG_AllDEGmethods_results")
file.move(list.files(pattern = 'Step21*'), "DEG_AllDEGmethods_results")
## 6 files moved. 0 failed.
file.move(list.files(pattern = '*AllDEGmethods.csv'), "DEG_AllDEGmethods_results")
## 0 files moved. 0 failed.
file.move(list.files(pattern = 'DEG_AllDEGmethods_temp.RData'), "DEG_AllDEGmethods_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = '*AllDEGmethods_aggregate_data.RData'), "DEG_AllDEGmethods_results")
## 1 file moved. 0 failed.
file.move(list.files(pattern = 'DEG_AllDEGmethodsnames.RData'), "DEG_AllDEGmethods_results")
## 1 file moved. 0 failed.
#remove extra files
file.remove(list.files(pattern ='VennDiagram*'))
## [1] TRUE
file.remove(list.files(pattern ='temp.RData'))
## logical(0)
#Remove .RData and clear environment to free up memory
rm(list=ls())
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 4123583 220.3    8073370 431.2  8073370 431.2
## Vcells 7967607  60.8   28223680 215.4 35279601 269.2